string_int.c 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269
  1. #include <stdlib.h>
  2. #include <string.h>
  3. #include <stdint.h>
  4. #include <assert.h>
  5. #define MURMUR_SEED 718281828
  6. #include "hash.h"
  7. #include "hash_fns/MurmurHash3.h"
  8. #include "string_int.h"
  9. typedef struct memory_chunk {
  10. char* data;
  11. size_t len;
  12. size_t fill;
  13. } memory_chunk_t;
  14. typedef struct hash_bucket {
  15. uint64_t hash;
  16. char* value;
  17. } hash_bucket_t;
  18. typedef struct string_internment_table {
  19. struct {
  20. hash_bucket_t* buckets;
  21. size_t alloc_size;
  22. size_t fill;
  23. } ht;
  24. int pool_alloc;
  25. int pool_fill;
  26. memory_chunk_t* pool;
  27. // memory_chunk_t* fill_head;
  28. } string_internment_table_t;
  29. struct string_internment_table* global_string_internment_table;
  30. static ptrdiff_t find_bucket_strint(string_internment_table_t* tab, uint64_t hash, char* key);
  31. static ptrdiff_t find_bucket_n_strint(string_internment_table_t* tab, uint64_t hash, char* key, size_t key_len);
  32. static int resize(string_internment_table_t* tab, size_t new_size);
  33. void string_internment_table_init(struct string_internment_table** ptab) {
  34. struct string_internment_table* tab = calloc(1, sizeof(*tab));
  35. *ptab = tab;
  36. tab->pool_alloc = 32;
  37. tab->pool_fill = 0;
  38. tab->pool = malloc(tab->pool_alloc * sizeof(*tab->pool));
  39. tab->ht.alloc_size = 1024;
  40. tab->ht.fill = 0;
  41. tab->ht.buckets = calloc(1, tab->ht.alloc_size * sizeof(*tab->ht.buckets));
  42. }
  43. static char* intern_string(struct string_internment_table* tab, char* s, int64_t len) {
  44. char* slot;
  45. memory_chunk_t* p = tab->pool;
  46. for(int i = 0; i < tab->pool_fill; i++, p++) {
  47. if(p->len - p->fill > len) {
  48. // found a spot;
  49. slot = p->data + p->fill;
  50. memcpy(slot, s, len);
  51. slot[len] = 0;
  52. p->fill += len + 1;
  53. return slot;
  54. }
  55. }
  56. //none of the chunks have enough room. add a new chunk.
  57. if(tab->pool_fill >= tab->pool_alloc) {
  58. tab->pool_alloc *= 2;
  59. tab->pool = realloc(tab->pool, tab->pool_alloc * sizeof(*tab->pool));
  60. }
  61. p = &tab->pool[tab->pool_fill];
  62. p->len = 1024 * 1024 * 8;
  63. p->fill = 0;
  64. p->data = malloc(p->len * sizeof(*p->data));
  65. tab->pool_fill++;
  66. slot = p->data + p->fill;
  67. memcpy(slot, s, len);
  68. slot[len] = 0;
  69. p->fill += len + 1;
  70. return slot;
  71. }
  72. // returns a pointer to the permanent unique string
  73. char* strint_(struct string_internment_table* tab, char* s) {
  74. // returns a pointer to the permanent unique string
  75. return strnint_(tab, s, strlen(s));
  76. }
  77. // returns a pointer to the permanent unique string
  78. char* strnint_(struct string_internment_table* tab, char* s, size_t slen) {
  79. uint64_t hash[2];
  80. int64_t bi;
  81. char* ps;
  82. MurmurHash3_x64_128(s, slen, MURMUR_SEED, hash);
  83. bi = find_bucket_n_strint(tab, hash[0], s, slen);
  84. if(tab->ht.buckets[bi].value) {
  85. // found it already, bail early
  86. return tab->ht.buckets[bi].value;
  87. }
  88. // add the string into the table
  89. // check size and grow if necessary
  90. if((float)tab->ht.fill / (float)tab->ht.alloc_size >= 0.75) {
  91. resize(tab, tab->ht.alloc_size * 2);
  92. bi = find_bucket_n_strint(tab, hash[0], s, slen);
  93. }
  94. ps = intern_string(tab, s, slen);
  95. tab->ht.buckets[bi].hash = hash[0];
  96. tab->ht.buckets[bi].value = ps;
  97. tab->ht.fill++;
  98. return ps;
  99. }
  100. // should always be called with a power of two
  101. static int resize(string_internment_table_t* tab, size_t new_size) {
  102. hash_bucket_t* buckets, *old, *op;
  103. int64_t old_len = tab->ht.alloc_size;
  104. int64_t i, n, bi;
  105. old = tab->ht.buckets;
  106. buckets = calloc(1, new_size * sizeof(*buckets));
  107. tab->ht.buckets = buckets;
  108. tab->ht.alloc_size = new_size;
  109. for(i = 0, n = 0; i < old_len && n < tab->ht.fill; i++) {
  110. op = &old[i];
  111. if(op->value == NULL) {
  112. continue;
  113. }
  114. n++;
  115. bi = find_bucket_strint(tab, op->hash, op->value);
  116. buckets[bi] = *op;
  117. }
  118. free(old);
  119. return 0;
  120. }
  121. static ptrdiff_t find_bucket_strint(string_internment_table_t* tab, uint64_t hash, char* key) {
  122. int64_t startBucket, bi;
  123. bi = hash % tab->ht.alloc_size;
  124. startBucket = bi;
  125. do {
  126. hash_bucket_t* bucket = tab->ht.buckets + bi;
  127. // empty bucket
  128. if(bucket->value == NULL) {
  129. return bi;
  130. }
  131. if(bucket->hash == hash) {
  132. if(!strcmp(key, bucket->value)) {
  133. // bucket is the right one and contains a value already
  134. return bi;
  135. }
  136. // collision, probe next bucket
  137. }
  138. bi = (bi + 1) % tab->ht.alloc_size;
  139. } while(bi != startBucket);
  140. // should never reach here if the table is maintained properly
  141. // fprintf(stderr, "Invalid bucket -1 in strint\n");
  142. assert(0);
  143. return -1;
  144. }
  145. static ptrdiff_t find_bucket_n_strint(string_internment_table_t* tab, uint64_t hash, char* key, size_t key_len) {
  146. int64_t startBucket, bi;
  147. bi = hash % tab->ht.alloc_size;
  148. startBucket = bi;
  149. do {
  150. hash_bucket_t* bucket = tab->ht.buckets + bi;
  151. // printf("%ld v: %s\n", bi, bucket->value);
  152. // empty bucket
  153. if(bucket->value == NULL) {
  154. return bi;
  155. }
  156. if(bucket->hash == hash) {
  157. if(!strncmp(key, bucket->value, key_len)) {
  158. // bucket is the right one and contains a value already
  159. return bi;
  160. }
  161. // collision, probe next bucket
  162. }
  163. bi = (bi + 1) % tab->ht.alloc_size;
  164. } while(bi != startBucket);
  165. // should never reach here if the table is maintained properly
  166. // fprintf(stderr, "Invalid bucket -1 in strint\n");
  167. assert(0);
  168. return -1;
  169. }