preprocess.c 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415
  1. #include <stdlib.h>
  2. #include <stdio.h>
  3. #include <ctype.h>
  4. #include "sti/sti.h"
  5. #include "tree.h"
  6. #define double float
  7. char* strtolower(char* s) {
  8. for(char* x = s; *x; x++) *x = tolower(*x);
  9. return s;
  10. }
  11. typedef struct namestats {
  12. char* name;
  13. long cnt;
  14. char gender; // m,f
  15. } namestats;
  16. double nrand() {
  17. return (double)rand() / (double)RAND_MAX;
  18. }
  19. typedef struct {
  20. double total;
  21. double total_pairs;
  22. double total_triplets;
  23. double lengths[15];
  24. double positions[15][26];
  25. double positions_totals[15];
  26. double pairs[26][26];
  27. double pairs_totals[26];
  28. double triplets[26][26][26];
  29. double triplets_totals[26][26];
  30. double quads[26][26][26][26];
  31. double quads_totals[26][26][26];
  32. double quints[26][26][26][26][26];
  33. double quints_totals[26][26][26][26];
  34. seqtree* prefix;
  35. seqtree* internal;
  36. seqtree* suffix;
  37. } letter_stats;
  38. #define FOR(a, limit) for(long a = 0; a < limit; a++)
  39. char letter_2(letter_stats* ls, char prev) {
  40. double letter_p = nrand();
  41. double sum = 0;
  42. FOR(a, 26) {
  43. sum += ls->pairs[(int)prev - 'a'][a];
  44. if(sum >= letter_p) {
  45. return (char)a + 'a';
  46. }
  47. }
  48. return 'z';
  49. }
  50. char letter_3(letter_stats* ls, char minus2, char minus1) {
  51. double letter_p = nrand();
  52. double sum = 0;
  53. FOR(a, 26) {
  54. sum += ls->triplets[(int)minus2 - 'a'][(int)minus1 - 'a'][a];
  55. // printf("%d, %d sum: %f of %f\n", minus2, minus1, sum, letter_p);
  56. if(sum >= letter_p) {
  57. return (char)a + 'a';
  58. }
  59. }
  60. return -1;
  61. }
  62. char letter_4(letter_stats* ls, char minus3, char minus2, char minus1) {
  63. double letter_p = nrand();
  64. double sum = 0;
  65. FOR(a, 26) {
  66. sum += ls->quads[(int)minus3 - 'a'][(int)minus2 - 'a'][(int)minus1 - 'a'][a];
  67. // printf("%d, %d sum: %f of %f\n", minus2, minus1, sum, letter_p);
  68. if(sum >= letter_p) {
  69. return (char)a + 'a';
  70. }
  71. }
  72. return -1;
  73. }
  74. char letter_5(letter_stats* ls, char minus4, char minus3, char minus2, char minus1) {
  75. double letter_p = nrand();
  76. double sum = 0;
  77. FOR(a, 26) {
  78. sum += ls->quints[(int)minus4 - 'a'][(int)minus3 - 'a'][(int)minus2 - 'a'][(int)minus1 - 'a'][a];
  79. // printf("%d, %d sum: %f of %f\n", minus2, minus1, sum, letter_p);
  80. if(sum >= letter_p) {
  81. return (char)a + 'a';
  82. }
  83. }
  84. return -1;
  85. }
  86. char positional(letter_stats* ls, int i) {
  87. double letter_p = nrand();
  88. double sum = 0;
  89. FOR(a, 26) {
  90. sum += ls->positions[i][a];
  91. if(sum >= letter_p) {
  92. return (char)a + 'a';
  93. break;
  94. }
  95. }
  96. return 'z';
  97. }
  98. int main(int argc, char* argv[]) {
  99. srand(argc > 1 ? atoi(argv[1]) : 0);
  100. HT(namestats*) m_stats;
  101. HT(namestats*) f_stats;
  102. HT_init(&m_stats, 4096*8);
  103. HT_init(&f_stats, 4096*8);
  104. rglob gn_files;
  105. recursive_glob("./data/", "yob*.txt", 0, &gn_files);
  106. for(int fi = 0; fi < gn_files.len; fi++) {
  107. size_t line_cnt;
  108. char* src = read_whole_file(gn_files.entries[fi].full_path, NULL);
  109. char** lines = strsplit_inplace(src, '\n', &line_cnt);
  110. for(char** line = lines; *line; line++) {
  111. char* e = strchr(*line, ',');
  112. char* name = strndup(*line, e - *line);
  113. *line = e + 1;
  114. int female = **line == 'F';
  115. long cnt = strtol(*line + 2, NULL, 10);
  116. // printf("'%s', female? %d, cnt: %ld\n", name, female, cnt);
  117. namestats* p = NULL;
  118. if(!female) {
  119. if(HT_get(&m_stats, name, &p)) {
  120. p = calloc(1, sizeof(*p));
  121. p->name = strtolower(strdup(name));
  122. p->gender = 'm';
  123. HT_set(&m_stats, name, p);
  124. }
  125. }
  126. else {
  127. if(HT_get(&f_stats, name, &p)) {
  128. p = calloc(1, sizeof(*p));
  129. p->name = strtolower(strdup(name));
  130. p->gender = 'f';
  131. HT_set(&f_stats, name, p);
  132. }
  133. }
  134. p->cnt += cnt;
  135. free(name);
  136. }
  137. // printf("%s: %ld lines\n", gn_files.entries[fi].file_name, line_cnt);
  138. free(lines);
  139. free(src);
  140. // break;
  141. }
  142. if(gn_files.entries) free(gn_files.entries);
  143. letter_stats* ls = calloc(1, sizeof(*ls));
  144. // memset(&ls, 0, sizeof(ls));
  145. ls->prefix = calloc(1, sizeof(*ls->prefix));
  146. ls->internal = calloc(1, sizeof(*ls->internal));
  147. ls->suffix = calloc(1, sizeof(*ls->suffix));
  148. HT_EACH(&m_stats, name, namestats*, st) {
  149. // if(st->cnt == 1337 || st->cnt == 31337 || st->cnt == 42069) printf("%s: %ld\n", st->name, st->cnt);
  150. long l = strlen(st->name);
  151. FOR(i, l) {
  152. sq_insert_seq(ls->prefix, st->name, i + 1, st->cnt);
  153. FOR(n, l - i) {
  154. sq_insert_seq(ls->internal, st->name + i, n + 1, st->cnt);
  155. }
  156. if(l >= 4) {
  157. sq_insert_seq(ls->suffix, st->name + l - 4, 4, st->cnt);
  158. }
  159. }
  160. /*
  161. for(char* s = st->name; s[0] && s[1]; s++) {
  162. ls->pairs[s[0] - 'a'][s[1] - 'a'] += st->cnt;
  163. ls->pairs_totals[s[0] - 'a'] += st->cnt;
  164. ls->total_pairs += st->cnt;
  165. }
  166. for(char* s = st->name; s[0] && s[1] && s[2]; s++) {
  167. ls->triplets[s[0] - 'a'][s[1] - 'a'][s[2] - 'a'] += st->cnt;
  168. ls->triplets_totals[s[0] - 'a'][s[1] - 'a'] += st->cnt;
  169. }
  170. for(char* s = st->name; s[0] && s[1] && s[2] && s[3]; s++) {
  171. ls->quads[s[0] - 'a'][s[1] - 'a'][s[2] - 'a'][s[3] - 'a'] += st->cnt;
  172. ls->quads_totals[s[0] - 'a'][s[1] - 'a'][s[2] - 'a'] += st->cnt;
  173. }
  174. for(char* s = st->name; s[0] && s[1] && s[2] && s[3] && s[4]; s++) {
  175. ls->quints[s[0] - 'a'][s[1] - 'a'][s[2] - 'a'][s[3] - 'a'][s[4] - 'a'] += st->cnt;
  176. ls->quints_totals[s[0] - 'a'][s[1] - 'a'][s[2] - 'a'][s[3] - 'a'] += st->cnt;
  177. }
  178. for(int i = 0; st->name[i]; i++) {
  179. ls->positions[i][st->name[i] - 'a'] += st->cnt;
  180. ls->positions_totals[i] += st->cnt;
  181. }
  182. */
  183. ls->lengths[l] += st->cnt;
  184. ls->total += st->cnt;
  185. // minlen = fmin(strlen(st->name), minlen);
  186. // maxlen = fmax(strlen(st->name), maxlen);
  187. }
  188. sq_total_and_invert(ls->prefix);
  189. sq_total_and_invert(ls->internal);
  190. sq_total_and_invert(ls->suffix);
  191. // double inv_pairs = 1.0 / total_pairs;
  192. /*
  193. double inv_total = 1.0 / ls->total;
  194. for(int i = 0; i < 15; i++) {
  195. ls->lengths[i] *= inv_total;
  196. double inv = 1.0 / ls->positions_totals[i];
  197. for(int a = 0; a < 26; a++) {
  198. ls->positions[i][a] *= inv;
  199. }
  200. }
  201. int full_q = 0;
  202. int empty_q = 0;
  203. for(int a = 0; a < 26; a++) {
  204. double inv = 1.0 / ls->pairs_totals[a];
  205. for(int b = 0; b < 26; b++) {
  206. ls->pairs[a][b] *= inv;
  207. double inv_tri = 1.0 / ls->triplets_totals[a][b];
  208. FOR(c, 26) {
  209. ls->triplets[a][b][c] *= inv_tri;
  210. double inv_quad = 1.0 / ls->quads_totals[a][b][c];
  211. FOR(d, 26) {
  212. ls->quads[a][b][c][d] *= inv_quad;
  213. double inv_quint = 1.0 / ls->quints_totals[a][b][c][d];
  214. FOR(e, 26) {
  215. if(ls->quints[a][b][c][d][e] > 0) full_q++;
  216. else empty_q++;
  217. ls->quints[a][b][c][d][e] *= inv_quint;
  218. }
  219. }
  220. }
  221. // printf("%c%c: %f\n", a + 'a', b + 'a', ls.pairs[a][b]);
  222. }
  223. }
  224. printf("full/empty: %d, %d (%.2f%%)\n\n", full_q, empty_q, (full_q * 100. / empty_q));
  225. */
  226. /*
  227. FOR(x, 20) {
  228. double genlen_p = nrand();
  229. int genlen = 0;
  230. double sum = 0;
  231. FOR(i, 15) {
  232. sum += ls->lengths[i];
  233. if(sum >= genlen_p) {
  234. genlen = i;
  235. break;
  236. }
  237. // printf("%f\n", sum);
  238. }
  239. char name[15];
  240. name[0] = positional(ls, 0);
  241. printf("%c", name[0] + 'A' - 'a');
  242. name[1] = letter_2(ls, name[0]);
  243. printf("%c", name[1]);
  244. name[2] = letter_3(ls, name[0], name [1]);
  245. printf("%c", name[2]);
  246. name[3] = letter_4(ls, name[0], name [1], name [2]);
  247. printf("%c", name[3]);
  248. for(int i = 4; i < genlen; i++) {
  249. name[i] = letter_5(ls, name[i - 4], name[i - 3], name[i - 2], name[i - 1]);
  250. if(name[i] < 0) {
  251. break;
  252. name[i] = letter_4(ls, name[i - 3], name[i - 2], name[i - 1]);
  253. if(name[i] < 0) {
  254. // break;
  255. name[i] = letter_3(ls, name[i - 2], name[i - 1]);
  256. if(name[i] < 0) break;
  257. }
  258. }
  259. printf("%c", name[i]);
  260. }
  261. printf(" - %d, %.15f\n", genlen, genlen_p);
  262. }
  263. printf("\nmemory used: %ld mb\n", sizeof(*ls) / (1024*1024));
  264. */
  265. // sq_print_tree(ls->suffix, 0);
  266. FOR(x, 20) {
  267. char name[15];
  268. name[0] = sq_sample(ls->prefix);
  269. printf("%c", name[0] + 'A' - 'a');
  270. name[1] = sq_sample_depth(ls->prefix, name, 1);
  271. printf("%c", name[1]);
  272. for(int i = 2; i < 12; i++) {
  273. int n = fmin(i, 3);
  274. name[i] = sq_sample_depth(ls->internal, name + i - n, n);
  275. if(name[i] < 0) {
  276. printf(" -");
  277. break;
  278. }
  279. if(i >= 4) {
  280. float quit_chance = sq_prob_depth(ls->suffix, name + i - 4, 4);
  281. if(quit_chance > frandNorm() * 3.6 * (i / 15.)) {
  282. printf(" + %f", quit_chance);
  283. break;
  284. }
  285. }
  286. printf("%c", name[i]);
  287. }
  288. printf("\n");
  289. }
  290. printf("\n");
  291. printf("prefix memory used: %ld mb\n", sq_memstats(ls->prefix) / (1024*1024));
  292. printf("internal memory used: %ld mb\n", sq_memstats(ls->internal) / (1024*1024));
  293. printf("suffix memory used: %ld kb\n", sq_memstats(ls->suffix) / (1*1024));
  294. }