123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269 |
- #include <stdlib.h>
- #include <string.h>
- #include <stdint.h>
- #include <assert.h>
- #define MURMUR_SEED 718281828
- #include "hash.h"
- #include "hash_fns/MurmurHash3.h"
- #include "string_int.h"
- typedef struct memory_chunk {
- char* data;
- size_t len;
- size_t fill;
- } memory_chunk_t;
- typedef struct hash_bucket {
- uint64_t hash;
- char* value;
- } hash_bucket_t;
- typedef struct string_internment_table {
-
- struct {
- hash_bucket_t* buckets;
- size_t alloc_size;
- size_t fill;
- } ht;
-
- int pool_alloc;
- int pool_fill;
- memory_chunk_t* pool;
- // memory_chunk_t* fill_head;
-
- } string_internment_table_t;
- struct string_internment_table* global_string_internment_table;
- static ptrdiff_t find_bucket_strint(string_internment_table_t* tab, uint64_t hash, char* key);
- static ptrdiff_t find_bucket_n_strint(string_internment_table_t* tab, uint64_t hash, char* key, size_t key_len);
- static int resize(string_internment_table_t* tab, size_t new_size);
- void string_internment_table_init(struct string_internment_table** ptab) {
- struct string_internment_table* tab = calloc(1, sizeof(*tab));
- *ptab = tab;
-
- tab->pool_alloc = 32;
- tab->pool_fill = 0;
- tab->pool = malloc(tab->pool_alloc * sizeof(*tab->pool));
-
-
- tab->ht.alloc_size = 1024;
- tab->ht.fill = 0;
- tab->ht.buckets = calloc(1, tab->ht.alloc_size * sizeof(*tab->ht.buckets));
- }
- static char* intern_string(struct string_internment_table* tab, char* s, int64_t len) {
- char* slot;
-
- memory_chunk_t* p = tab->pool;
-
- for(int i = 0; i < tab->pool_fill; i++, p++) {
-
- if(p->len - p->fill > len) {
- // found a spot;
- slot = p->data + p->fill;
- memcpy(slot, s, len);
- slot[len] = 0;
-
- p->fill += len + 1;
-
- return slot;
- }
-
- }
-
- //none of the chunks have enough room. add a new chunk.
- if(tab->pool_fill >= tab->pool_alloc) {
- tab->pool_alloc *= 2;
- tab->pool = realloc(tab->pool, tab->pool_alloc * sizeof(*tab->pool));
- }
-
- p = &tab->pool[tab->pool_fill];
- p->len = 1024 * 1024 * 8;
- p->fill = 0;
- p->data = malloc(p->len * sizeof(*p->data));
-
- tab->pool_fill++;
-
- slot = p->data + p->fill;
- memcpy(slot, s, len);
- slot[len] = 0;
-
- p->fill += len + 1;
-
- return slot;
- }
- // returns a pointer to the permanent unique string
- char* strint_(struct string_internment_table* tab, char* s) {
- // returns a pointer to the permanent unique string
- return strnint_(tab, s, strlen(s));
- }
- // returns a pointer to the permanent unique string
- char* strnint_(struct string_internment_table* tab, char* s, size_t slen) {
- uint64_t hash[2];
- int64_t bi;
- char* ps;
-
- MurmurHash3_x64_128(s, slen, MURMUR_SEED, hash);
-
- bi = find_bucket_n_strint(tab, hash[0], s, slen);
-
- if(tab->ht.buckets[bi].value) {
- // found it already, bail early
- return tab->ht.buckets[bi].value;
- }
-
- // add the string into the table
-
- // check size and grow if necessary
- if((float)tab->ht.fill / (float)tab->ht.alloc_size >= 0.75) {
- resize(tab, tab->ht.alloc_size * 2);
- bi = find_bucket_n_strint(tab, hash[0], s, slen);
- }
-
- ps = intern_string(tab, s, slen);
-
- tab->ht.buckets[bi].hash = hash[0];
- tab->ht.buckets[bi].value = ps;
- tab->ht.fill++;
-
- return ps;
- }
- // should always be called with a power of two
- static int resize(string_internment_table_t* tab, size_t new_size) {
- hash_bucket_t* buckets, *old, *op;
- int64_t old_len = tab->ht.alloc_size;
- int64_t i, n, bi;
-
- old = tab->ht.buckets;
-
- buckets = calloc(1, new_size * sizeof(*buckets));
- tab->ht.buckets = buckets;
- tab->ht.alloc_size = new_size;
-
- for(i = 0, n = 0; i < old_len && n < tab->ht.fill; i++) {
-
- op = &old[i];
-
- if(op->value == NULL) {
- continue;
- }
-
- n++;
- bi = find_bucket_strint(tab, op->hash, op->value);
- buckets[bi] = *op;
- }
-
- free(old);
-
- return 0;
- }
- static ptrdiff_t find_bucket_strint(string_internment_table_t* tab, uint64_t hash, char* key) {
- int64_t startBucket, bi;
-
- bi = hash % tab->ht.alloc_size;
- startBucket = bi;
-
- do {
-
- hash_bucket_t* bucket = tab->ht.buckets + bi;
-
- // empty bucket
- if(bucket->value == NULL) {
- return bi;
- }
-
- if(bucket->hash == hash) {
- if(!strcmp(key, bucket->value)) {
- // bucket is the right one and contains a value already
- return bi;
- }
-
- // collision, probe next bucket
- }
-
- bi = (bi + 1) % tab->ht.alloc_size;
- } while(bi != startBucket);
-
- // should never reach here if the table is maintained properly
- // fprintf(stderr, "Invalid bucket -1 in strint\n");
- assert(0);
-
- return -1;
- }
- static ptrdiff_t find_bucket_n_strint(string_internment_table_t* tab, uint64_t hash, char* key, size_t key_len) {
- int64_t startBucket, bi;
-
- bi = hash % tab->ht.alloc_size;
- startBucket = bi;
-
-
- do {
- hash_bucket_t* bucket = tab->ht.buckets + bi;
-
-
- // printf("%ld v: %s\n", bi, bucket->value);
-
- // empty bucket
- if(bucket->value == NULL) {
- return bi;
- }
-
- if(bucket->hash == hash) {
- if(!strncmp(key, bucket->value, key_len)) {
- // bucket is the right one and contains a value already
- return bi;
- }
-
- // collision, probe next bucket
- }
-
- bi = (bi + 1) % tab->ht.alloc_size;
- } while(bi != startBucket);
-
- // should never reach here if the table is maintained properly
- // fprintf(stderr, "Invalid bucket -1 in strint\n");
- assert(0);
-
- return -1;
- }
|