// parser_gen.c (24 KB)
  1. // Public Domain
  2. #include <stdio.h>
  3. #include <stdlib.h>
  4. #include <string.h>
  5. #include <stdint.h>
  6. #include <ctype.h>
  7. #include <unistd.h>
  8. #include "../fs.h"
  9. #include "../vec.h"
  10. #include "../string.h"
  11. #include "../hash.h"
  12. int has_case_conflict = 0;
  13. char* exchg[] = {
  14. ['!'] = "bang",
  15. ['%'] = "pct",
  16. ['['] = "lbracket",
  17. [']'] = "rbracket",
  18. ['{'] = "lbrace",
  19. ['}'] = "rbrace",
  20. ['('] = "lparen",
  21. [')'] = "rparen",
  22. ['^'] = "caret",
  23. ['#'] = "pound",
  24. ['@'] = "at",
  25. ['~'] = "tilde",
  26. ['&'] = "amp",
  27. ['$'] = "dollar",
  28. ['`'] = "grave",
  29. ['\''] = "squote",
  30. ['"'] = "dquote",
  31. [':'] = "colon",
  32. [';'] = "semi",
  33. ['?'] = "quest",
  34. ['.'] = "dot",
  35. [','] = "comma",
  36. ['<'] = "lt",
  37. ['>'] = "gt",
  38. ['+'] = "plus",
  39. ['='] = "eq",
  40. ['_'] = "under",
  41. ['-'] = "dash",
  42. ['|'] = "pipe",
  43. ['\\'] = "bslash",
  44. ['/'] = "slash",
  45. ['*'] = "star",
  46. };
  47. typedef struct charset {
  48. char* name;
  49. char* raw;
  50. char* table;
  51. int minval;
  52. int maxval;
  53. int len;
  54. } charset;
  55. static void build_charset(charset* cs) {
  56. cs->minval = 9999999;
  57. cs->maxval = -9999999;
  58. for(int i = 0; cs->raw[i] != 0; i++) {
  59. if(cs->raw[i] < cs->minval) cs->minval = cs->raw[i];
  60. if(cs->raw[i] > cs->maxval) cs->maxval = cs->raw[i];
  61. }
  62. cs->len = cs->maxval - cs->minval;
  63. cs->table = calloc(1, sizeof(*cs->table) * cs->len);
  64. for(int i = 0; cs->raw[i] != 0; i++) {
  65. cs->table[(int)cs->raw[i] - cs->minval] = 1;
  66. }
  67. }
  68. int charset_has(charset* cs, int c) {
  69. int i = c - cs->minval;
  70. if(i > cs->maxval || i < 0) return 0;
  71. return cs->table[i];
  72. }
  73. typedef struct {
  74. int type; // 0 = char, 1 = charset, 2 = invchar, 3 = invcharset
  75. int action; // > each char, ^ discard char
  76. int invert;
  77. int c;
  78. char* cs_name;
  79. charset* cset;
  80. char* dest_state;
  81. } state_case_info;
  82. typedef struct strpair {
  83. char* key;
  84. char* val;
  85. } strpair;
  86. typedef struct state_info {
  87. char* name;
  88. VEC(state_case_info) c_cases;
  89. VEC(state_case_info) cs_cases;
  90. struct state_info* fail_to;
  91. char* retry_as;
  92. char* retry_as_cs_name;
  93. char is_terminal;
  94. VEC(strpair) terminal_data;
  95. } state_info;
  96. state_info* new_state_info(char* name) {
  97. state_info* si = calloc(1, sizeof(*si));
  98. si->name = strdup(name);
  99. return si;
  100. }
  101. state_case_info* add_case_char(state_info* si, int c, int invert, char* dest) {
  102. VEC_EACH(&si->c_cases, i, ci) {
  103. if(ci.type == 0 && ci.c == c && ci.invert == invert) {
  104. if(0 != strcmp(dest, ci.dest_state)) {
  105. printf("case conflict: %s (%c)->%s / (%c)->%s\n", si->name, ci.c, ci.dest_state, c, dest);
  106. has_case_conflict = 1;
  107. return NULL;
  108. }
  109. return &VEC_ITEM(&si->c_cases, i);
  110. }
  111. }
  112. VEC_INC(&si->c_cases);
  113. state_case_info* nci = &VEC_TAIL(&si->c_cases);
  114. nci->dest_state = strdup(dest);
  115. nci->type = 0;
  116. nci->c = c;
  117. nci->action = 0; // consume character
  118. nci->invert = invert;
  119. return nci;
  120. }
  121. state_case_info* add_case_cset(state_info* si, char* cset, char* dest, int action, int invert) {
  122. VEC_EACH(&si->cs_cases, i, ci) {
  123. if(ci.type == 1 && 0 == strcmp(ci.cs_name, cset)) {
  124. if(0 != strcmp(dest, ci.dest_state)) {
  125. printf("case conflict: %s [%s]->%s / [%s]->%s\n", si->name, ci.cs_name, ci.dest_state, cset, dest);
  126. return NULL;
  127. }
  128. return &VEC_ITEM(&si->cs_cases, i);
  129. }
  130. }
  131. VEC_INC(&si->cs_cases);
  132. state_case_info* nci = &VEC_TAIL(&si->cs_cases);
  133. nci->dest_state = strdup(dest);
  134. nci->type = 1;
  135. nci->cs_name = strdup(cset);
  136. nci->action = action;
  137. nci->invert = invert;
  138. return nci;
  139. }
  140. typedef struct tcontext {
  141. char* prefix;
  142. // tree-walking buffer
  143. char* fbuffer;
  144. char* buffer;
  145. size_t alloc;
  146. size_t len;
  147. int level;
  148. size_t tlen;
  149. // categorized state names
  150. VEC(state_info*) terminals;
  151. VEC(state_info*) internals;
  152. HT(state_info*) states;
  153. HT(charset*) csets;
  154. } tcontext;
  155. #define MAX(a,b) ((a) > (b) ? (a) : (b))
  156. static char* state_name(char* pre, char* r, int next) {
  157. char* b = malloc(4096);
  158. strcpy(b, pre);
  159. int len = strlen(pre);
  160. int last_was_an = 1;
  161. b[len++] = '_';
  162. b[len++] = '_';
  163. for(int i = 0; r[i] != 0; i++) {
  164. int rc = (int)r[i];
  165. if(isalnum(r[i])) {
  166. if(!last_was_an) {
  167. b[len++] = '_';
  168. }
  169. b[len++] = r[i];
  170. last_was_an = 1;
  171. continue;
  172. }
  173. if(exchg[rc]) {
  174. // if(last_was_an) {
  175. b[len++] = '_';
  176. // }
  177. strcpy(b + len, exchg[rc]);
  178. len += strlen(exchg[rc]);
  179. last_was_an = 0;
  180. continue;
  181. }
  182. printf("unhandleable char: %d\n", r[i]);
  183. }
  184. if(next == 0) {
  185. // nothin
  186. }
  187. else if(isalnum(next)) {
  188. if(!last_was_an) {
  189. b[len++] = '_';
  190. }
  191. b[len++] = next;
  192. }
  193. else if(exchg[next]) {
  194. b[len++] = '_';
  195. strcpy(b + len, exchg[next]);
  196. len += strlen(exchg[next]);
  197. }
  198. else {
  199. printf("unhandleable char: %d\n", next);
  200. }
  201. b[len] = 0;
  202. return b;
  203. }
  204. state_info* induce_state(char* name, tcontext* ctx) {
  205. state_info* s;
  206. if(HT_get(&ctx->states, name, (void*)&s)) {
  207. s = new_state_info(name);
  208. HT_set(&ctx->states, strdup(name), s); // BUG fix needing to strdup the name
  209. }
  210. return s;
  211. }
  212. typedef struct case_list {
  213. VEC(state_case_info) cases;
  214. } case_list;
  215. // expand a word into a set of transitions
  216. // returns the final state
  217. state_info* expand_word(
  218. char* word,
  219. state_info* base,
  220. case_list* extra,
  221. char* retry_as,
  222. char* retry_as_cs_name,
  223. tcontext* ctx,
  224. char char_i_cmp) {
  225. size_t blen = 0;
  226. size_t balloc = 256;
  227. char* buffer = malloc(balloc * sizeof(*buffer));
  228. char* prev_name = base->name;
  229. state_info* prev_st = base;
  230. for(int i = 0; word[i] != 0; i++) {
  231. int c = word[i];
  232. if(c == '\\') {
  233. i++;
  234. c = word[i];
  235. if(c == 't') c = '\t';
  236. else if(c == 'r') c = '\r';
  237. else if(c == 'n') c = '\n';
  238. else if(c == 'v') c = '\v';
  239. else if(c == '\\') c = '\\';
  240. else if(c == 0) {
  241. // malformed word
  242. break;
  243. }
  244. }
  245. // collect up new name
  246. buffer[blen++] = c;
  247. buffer[blen] = 0;
  248. char* this_name = state_name(base->name, buffer, 0);
  249. // printf("thisname: '%s'\n", this_name);
  250. state_info* s = induce_state(this_name, ctx);
  251. // s->is_terminal = n->is_terminal;
  252. if(retry_as) s->retry_as = strdup(retry_as);
  253. if(retry_as_cs_name) s->retry_as_cs_name = strdup(retry_as_cs_name);
  254. // add the transitions
  255. state_case_info* ci = add_case_char(prev_st, c, 0, this_name);
  256. (void)ci;
  257. state_case_info* cii = NULL;
  258. int cc = c;
  259. if(char_i_cmp && (c >= 'A' && c <= 'Z')) {
  260. cc = c + 32;
  261. cii = add_case_char(prev_st, cc, 0, this_name);
  262. } else if(char_i_cmp && (c >= 'a' && c <= 'z')) {
  263. cc = c - 32;
  264. cii = add_case_char(prev_st, cc, 0, this_name);
  265. }
  266. (void)cii;
  267. VEC_EACH(&extra->cases, i, cs) {
  268. add_case_cset(s, cs.cs_name, cs.dest_state, cs.action, cs.invert);
  269. }
  270. //
  271. // next
  272. prev_st = s;
  273. prev_name = this_name;
  274. (void)prev_name;
  275. }
  276. // the last state is a terminal state
  277. prev_st->is_terminal = 1;
  278. // TODO make the last statehave inverted logic for retry-as
  279. return prev_st;
  280. }
  281. // buff must be at least as long as it needs to be
  282. static void pretty_print_char(char* buf, int c) {
  283. #define ESC_CHAR(x) \
  284. buf[0] = '\''; \
  285. buf[1] = '\\'; \
  286. buf[2] = x; \
  287. buf[3] = '\''; \
  288. buf[4] = 0; \
  289. break;
  290. switch(c) {
  291. case '\t': ESC_CHAR('t');
  292. case '\r': ESC_CHAR('r');
  293. case '\n': ESC_CHAR('n');
  294. case '\v': ESC_CHAR('v');
  295. case '\'': ESC_CHAR('\'');
  296. case '\\': ESC_CHAR('\\');
  297. default:
  298. if(isgraph(c)) {
  299. buf[0] = '\'';
  300. buf[1] = c;
  301. buf[2] = '\'';
  302. buf[3] = 0;
  303. }
  304. else {
  305. sprintf(buf, "0x%x", c);
  306. }
  307. }
  308. }
  309. static void print_state_switch(state_info* si) {
  310. char buf[16];
  311. printf("\ncase %s:\n", si->name);
  312. if(VEC_LEN(&si->c_cases)) {
  313. printf("\tswitch(c) {\n");
  314. VEC_EACH(&si->c_cases, ckey, ci) {
  315. if(ci.invert) continue;
  316. pretty_print_char(buf, ci.c);
  317. char* action = "push_char_id";
  318. if(ci.action == 1) action = "push_char_done";
  319. else if(ci.action == 2) action = "done_zero_move";
  320. printf("\t\tcase %s: %s(%s);\n", buf, action, ci.dest_state);
  321. }
  322. printf("\t}\n");
  323. VEC_EACH(&si->c_cases, ckey, ci) {
  324. if(!ci.invert) continue;
  325. pretty_print_char(buf, ci.c);
  326. char* action = "push_char_id";
  327. if(ci.action == 1) action = "push_char_done";
  328. else if(ci.action == 2) action = "done_zero_move";
  329. printf("\tif(c != %s) { %s(%s); }\n", buf, action, ci.dest_state);
  330. }
  331. }
  332. if(VEC_LEN(&si->cs_cases)) {
  333. VEC_EACH(&si->cs_cases, ckey, ci) {
  334. if(si->retry_as && 0 == strcmp(si->retry_as, ci.cs_name)) continue;
  335. char* action = "push_char_id";
  336. if(ci.action == 1) action = "push_char_done";
  337. else if(ci.action == 2) action = "done_zero_move";
  338. char* inv = ci.invert ? "!" : "";
  339. printf("\tif(%scharset_has(cset_%s, c)) { %s(%s); }\n", inv, ci.cs_name, action, ci.dest_state);
  340. }
  341. }
  342. if(si->fail_to) {
  343. printf("push_char_id(%s);\n", si->fail_to);
  344. }
  345. else if(si->is_terminal) {
  346. if(si->retry_as) printf("\tif(charset_has(cset_%s, c)) { retry_as(%s); }\n", si->retry_as_cs_name, si->retry_as);
  347. printf("\tdone_zero_move(%s);\n", si->name);
  348. }
  349. else if(si->retry_as)
  350. printf("\tretry_as(%s);\n", si->retry_as);
  351. else
  352. printf("\tgoto ERROR;\n");
  353. }
  354. static char* word_end(char* s, size_t* n) {
  355. // TODO: suport escaped spaces
  356. char* end = strpbrk(s, " \n\t\r");
  357. if(!end) end = s + strlen(s);
  358. if(n) *n = end - s;
  359. return end;
  360. }
  361. static int state_sort_fn(void* a_, void* b_) {
  362. state_info** a = a_;
  363. state_info** b = b_;
  364. return strcmp((*a)->name, (*b)->name);
  365. }
  366. static int case_sort_fn(void* a_, void* b_) {
  367. state_case_info* a = a_;
  368. state_case_info* b = b_;
  369. return a->c - b->c;
  370. }
  371. static int case_cs_sort_fn(void* a_, void* b_) {
  372. state_case_info* a = a_;
  373. state_case_info* b = b_;
  374. return strcmp(a->cs_name, b->cs_name);
  375. }
  376. int main(int argc, char* argv[]) {
  377. char ac;
  378. char print_data = 0;
  379. char print_data_defs = 0;
  380. char print_unique_data = 0;
  381. char print_triplet_data = 0;
  382. char print_enums = 0;
  383. char print_enum_names = 0;
  384. char print_guards = 0;
  385. char print_macros = 0;
  386. char print_switch = 0;
  387. char print_csets = 0;
  388. char* enum_pattern = NULL;
  389. char* terminal_pattern = NULL;
  390. char* fname = NULL;
  391. char* prefix = "LST__";
  392. char* guard_prefix = "PARSER_INCLUDE";
  393. char* data_def_prefix = "STATE_DATA_DEF_";
  394. while((ac = getopt(argc, argv, "cdD:eE:fF:gG:mnsT:uv")) != -1) {
  395. switch(ac) {
  396. case 'c': print_csets = 1; break;
  397. case 'd': print_data = 1; break;
  398. case 'D':
  399. print_data = 1;
  400. data_def_prefix = optarg;
  401. break;
  402. case 'e': print_enums = 1; break;
  403. case 'f': print_data_defs = 1; break;
  404. case 'F':
  405. print_data_defs = 1;
  406. data_def_prefix = optarg;
  407. break;
  408. case 'g': print_guards = 1; break;
  409. case 'G':
  410. print_guards = 1;
  411. guard_prefix = optarg;
  412. break;
  413. case 'E':
  414. print_enums = 1;
  415. enum_pattern = optarg;
  416. break;
  417. case 'm': print_macros = 1; break;
  418. case 'n': print_enum_names = 1; break;
  419. case 's': print_switch = 1; break;
  420. case 'T':
  421. print_enums = 1;
  422. terminal_pattern = optarg;
  423. break;
  424. case 'u': print_unique_data = 1; break;
  425. case 'v': print_triplet_data = 1; break;
  426. /* TODO:
  427. combine like states into one fall-through case.
  428. default start state name
  429. */
  430. }
  431. }
  432. if(optind < argc) {
  433. fname = argv[optind];
  434. }
  435. (void)terminal_pattern;
  436. (void)enum_pattern;
  437. size_t flen;
  438. size_t max_len = 0;
  439. char* src = readWholeFile(fname, &flen);
  440. char** lines = strsplit_inplace(src, '\n', NULL);
  441. // node* root = new_node(0);
  442. // HashTable(state_info) states;
  443. // HT_init(&states, 1024);
  444. // HashTable(charset) csets;
  445. // HT_init(&csets, 64);
  446. tcontext* tctx = calloc(1, sizeof(*tctx));
  447. tctx->alloc = max_len + 1;
  448. tctx->fbuffer = malloc(tctx->alloc);
  449. tctx->prefix = prefix;
  450. strcpy(tctx->fbuffer, tctx->prefix);
  451. tctx->buffer = tctx->fbuffer + strlen(tctx->prefix);
  452. HT_init(&tctx->states, 1024);
  453. HT_init(&tctx->csets, 64);
  454. for(int i = 0; lines[i]; i++) {
  455. int c = lines[i][0];
  456. char* s, *end;
  457. size_t wl;
  458. char* state_prefix = NULL;//strdup("LST_NULL"); // TODO unhardcode
  459. s = lines[i];
  460. // printf("i: %d, '%c/%d'\n", i, *s, *s);
  461. if(c == 0) continue; // empty line
  462. if(c == '#') continue; // comments
  463. VEC(strpair) terminal_data;
  464. VEC_INIT(&terminal_data);
  465. // mark state as terminal
  466. if(c == '&') {
  467. s = lines[i] + 1;
  468. end = word_end(s, &wl);
  469. // the name
  470. char* sname = strndup(s, wl);
  471. state_info* si = induce_state(sname, tctx);
  472. si->is_terminal = 1;
  473. s = end;
  474. // save any terminal data
  475. while(*s) {
  476. while(*s && *s == ' ') s++;
  477. if(!*s || *s == '\r' || *s == '\n') break;
  478. if(*s == ':') { // token type
  479. char* td_key = NULL;
  480. char* td_val = NULL;
  481. s++;
  482. end = strpbrk(s, "= \n\t\r");
  483. if(!end) end = s + strlen(s);
  484. wl = end - s;
  485. td_val = strndup(s, wl);
  486. if(*end == '=') { // key pair
  487. td_key = td_val;
  488. s = end;
  489. end = word_end(++s, &wl);
  490. td_val = strndup(s, wl);
  491. }
  492. VEC_PUSH(&si->terminal_data, ((strpair){td_key, td_val}));
  493. s = end;
  494. }
  495. }
  496. free(sname);
  497. continue;
  498. }
  499. // character set declaration
  500. if(c == '[') {
  501. s = lines[i] + 1;
  502. end = word_end(s, &wl);
  503. // the name
  504. char* sname = strndup(s, wl);
  505. s = end;
  506. while(*s && *s == ' ') s++;
  507. end = word_end(s, &wl);
  508. // char set itself
  509. VEC(char) chars;
  510. VEC_INIT(&chars);
  511. while(*s && *s == ' ') s++;
  512. while(*s && !isspace(*s)) {
  513. if(s[0] == '\\') { // escape
  514. int ec = s[1];
  515. if(ec == 't') ec = '\t';
  516. else if(ec == 'r') ec = '\r';
  517. else if(ec == 'n') ec = '\n';
  518. else if(ec == 'v') ec = '\v';
  519. else if(ec == '\\') ec = '\\';
  520. VEC_PUSH(&chars, ec);
  521. s++;
  522. }
  523. else if(s[1] == '-') { // range
  524. char low = s[0];
  525. char high = s[2];
  526. for(char c = low; c <= high; c++) {
  527. VEC_PUSH(&chars, c);
  528. }
  529. s += 2;
  530. }
  531. else { // normal char
  532. VEC_PUSH(&chars, s[0]);
  533. }
  534. s++;
  535. }
  536. VEC_PUSH(&chars, 0);
  537. charset* cs = calloc(1, sizeof(*cs));
  538. cs->raw = strdup(chars.data);
  539. cs->name = sname;
  540. build_charset(cs);
  541. if(HT_set(&tctx->csets, sname, cs)) {
  542. printf("ht fail: '%s'\n", sname);
  543. }
  544. // printf("%s: '%s'\n", sname, chars.data);
  545. VEC_FREE(&chars);
  546. continue;
  547. }
  548. // state prefix
  549. if(c == ':') {
  550. end = word_end(++s, &wl);
  551. state_prefix = strndup(s, wl);
  552. s = end;
  553. while(*s && *s == ' ') s++;
  554. }
  555. else {
  556. state_prefix = strdup("LST_NULL");
  557. }
  558. state_info* pst = induce_state(state_prefix, tctx);
  559. // if(HT_get(&tctx->states, state_prefix, (void*)&pst)) {
  560. // pst = new_state_info(state_prefix);
  561. // HT_set(&tctx->states, state_prefix, pst);
  562. // }
  563. char* cached_word = NULL;
  564. char char_i_cmp = 0;
  565. // word
  566. // state_info* lst; // last state of the word
  567. if(*s == '{' || *s == '(') {
  568. end = strpbrk(lines[i] + 1, " \n\t\r");
  569. wl = end - lines[i] - 1;
  570. // printf(">%d %s\n", l, lines[i]+1);
  571. cached_word = strndup(s+1, wl);
  572. if(*s == '(') char_i_cmp = 1;
  573. // lst = expand_word(w, pst, tctx);
  574. // put the word in the tree
  575. // n = insert_word(pst->words, s + 1, wl);
  576. max_len = MAX(max_len, wl);
  577. // free(w);
  578. // check for various metadata
  579. s = end;
  580. while(*s && *s == ' ') s++;
  581. }
  582. // single char transition
  583. // else if(*s == '@') {
  584. // s = word_end(++s, &wl);
  585. // }
  586. char* retry_as = NULL;
  587. char* retry_as_cs_name = NULL;
  588. case_list extra;
  589. VEC_INIT(&extra.cases);
  590. while(*s) {
  591. int invert = 0;
  592. while(*s && *s == ' ') s++;
  593. if(!*s || *s == '\r' || *s == '\n') break;
  594. if(*s == '!') {
  595. s++;
  596. invert = 1;
  597. if(!*s) break;
  598. }
  599. if(*s == ':') { // token type
  600. char* td_key = NULL;
  601. char* td_val = NULL;
  602. s++;
  603. end = strpbrk(s, "= \n\t\r");
  604. if(!end) end = s + strlen(s);
  605. wl = end - s;
  606. td_val = strndup(s, wl);
  607. if(*end == '=') { // key pair
  608. td_key = td_val;
  609. s = end;
  610. end = word_end(++s, &wl);
  611. td_val = strndup(s, wl);
  612. }
  613. VEC_PUSH(&terminal_data, ((strpair){td_key, td_val}));
  614. s = end;
  615. continue;
  616. }
  617. // go-to
  618. if(*s == '>') {
  619. s++;
  620. end = word_end(s, &wl);
  621. char* fail_to = strndup(s, wl);
  622. pst->fail_to = fail_to;
  623. break;
  624. }
  625. // retry-as, the final failto
  626. if(*s == '|') {
  627. s++;
  628. end = strchr(s, '>');
  629. char* set_name = strndup(s, end - s);
  630. s = end + 1;
  631. end = word_end(s, &wl);
  632. end = word_end(s, &wl);
  633. retry_as = strndup(s, wl);
  634. retry_as_cs_name = set_name;
  635. s = end;
  636. continue;
  637. }
  638. // char fail-to
  639. if(*s == '@') {
  640. s++;
  641. int ec = *s;
  642. if(ec == '\\') {
  643. ec = *++s;
  644. if(ec == 't') ec = '\t';
  645. else if(ec == 'r') ec = '\r';
  646. else if(ec == 'n') ec = '\n';
  647. else if(ec == 'v') ec = '\v';
  648. // TODO: hex/unicode exscape
  649. // else the literal char
  650. }
  651. s++;
  652. int type = *s++;
  653. // the target state
  654. end = word_end(s, &wl);
  655. char* fail_to = strndup(s, wl);
  656. state_case_info* ci = add_case_char(pst, ec, invert, fail_to);
  657. if(type == '>') ci->action = 0;
  658. else if(type == '=') ci->action = 1;
  659. else if(type == '~') ci->action = 2;
  660. free(fail_to);
  661. s = end;
  662. continue;
  663. }
  664. // charset fail-to
  665. if(*s == '+') {
  666. s++;
  667. end = strpbrk(s, ">=~");
  668. char* set_name = strndup(s, end - s);
  669. int type = *end;
  670. s = end + 1;
  671. end = word_end(s, &wl);
  672. char* fail_to = strndup(s, wl);
  673. // TODO: look up lazily later
  674. // if(HT_get(&csets, set_name, (void*)&n->fail_charset)) {
  675. // printf("failed to get charset: %p '%s'\n", n->fail_charset, set_name);
  676. // }
  677. // TODO: state transition info
  678. int action = 0;
  679. if(type == '>') action = 0;
  680. else if(type == '=') action = 1;
  681. else if(type == '~') action = 2;
  682. VEC_PUSH(&extra.cases, ((state_case_info){
  683. .type = 1,
  684. .cs_name = set_name,
  685. .dest_state = fail_to,
  686. .action = action,
  687. .invert = invert,
  688. }));
  689. // printf("//Generating extra case <%s> with type/action <%c/%d> for set <%s>\n", set_name, type, action, set_name);
  690. (void)fail_to;
  691. (void)set_name;
  692. // state_case_info* ci = add_case_cset(pst, set_name, fail_to);
  693. // (void)ci;
  694. // free(fail_to);
  695. s = end;
  696. continue;
  697. }
  698. printf("unknown s: '%c'\n", *s);
  699. s++;
  700. }
  701. if(cached_word) {
  702. state_info* final = expand_word(cached_word, pst, &extra, retry_as, retry_as_cs_name, tctx, char_i_cmp);
  703. VEC_CAT(&final->terminal_data, &terminal_data);
  704. }
  705. else {
  706. VEC_CAT(&pst->terminal_data, &terminal_data);
  707. VEC_EACH(&extra.cases, i, ci) {
  708. add_case_cset(pst, ci.cs_name, ci.dest_state, ci.action, ci.invert);
  709. }
  710. }
  711. cached_word = NULL;
  712. VEC_EACH(&extra.cases, i, s) { free(s.cs_name); free(s.dest_state); }
  713. VEC_FREE(&extra.cases);
  714. VEC_FREE(&terminal_data);
  715. if(retry_as) free(retry_as);
  716. if(retry_as_cs_name) free(retry_as_cs_name);
  717. }
  718. // print_node(root);
  719. HT_EACH(&tctx->states, key, state_info*, si) {
  720. VEC_SORT(&si->c_cases, case_sort_fn);
  721. VEC_SORT(&si->cs_cases, case_cs_sort_fn);
  722. if(si->is_terminal) {
  723. // printf("t:%s %p, %s, %s\n", key, (void*)si, si->name, si->retry_as);
  724. VEC_PUSH(&tctx->terminals, si);
  725. }
  726. else {
  727. // printf("i:%s %p, %s\n", key, (void*)si, si->name);
  728. VEC_PUSH(&tctx->internals, si);
  729. }
  730. }
  731. VEC_SORT(&tctx->terminals, state_sort_fn);
  732. VEC_SORT(&tctx->internals, state_sort_fn);
  733. if(has_case_conflict) exit(1);
  734. if(print_macros) {
  735. if(print_guards) printf("#ifdef %s_MACROS\n", guard_prefix);
  736. printf(
  737. "#define push_char_id(_state) \\\n"
  738. "do { \\\n"
  739. " st->state = _state; \\\n"
  740. " goto PUSH_CHAR_RET; \\\n"
  741. "} while(0)\n"
  742. "\n"
  743. "\n"
  744. "#define discard_char_id(_state) \\\n"
  745. "do { \\\n"
  746. " st->state = _state; \\\n"
  747. " return 1; \\\n"
  748. "} while(0)\n"
  749. "\n"
  750. "\n"
  751. "#define retry_as(_state) \\\n"
  752. "do { \\\n"
  753. " st->state = _state; \\\n"
  754. " goto RETRY; \\\n"
  755. "} while(0);\n"
  756. "\n"
  757. "#define done_zero_move(_state) \\\n"
  758. "do { \\\n"
  759. " st->state = _state; \\\n"
  760. " goto TOKEN_DONE; \\\n"
  761. "} while(0);\n"
  762. "\n"
  763. "#define push_char_done(_state) \\\n"
  764. "do { \\\n"
  765. " st->state = _state; \\\n"
  766. " goto PUSH_CHAR_DONE; \\\n"
  767. "} while(0);\n"
  768. "\n"
  769. "#define charset_has(cs, c) (c <= cs##_len && !!cs[c])\n"
  770. );
  771. if(print_guards) printf("#endif // %s_MACROS\n\n\n", guard_prefix);
  772. }
  773. if(print_enum_names) {
  774. if(print_guards) printf("#ifdef %s_ENUM_NAMES\n", guard_prefix);
  775. printf("// terminals\n");
  776. VEC_EACH(&tctx->terminals, i, t) {
  777. printf("[%s] = \"%s\",\n", t->name, t->name);
  778. }
  779. printf("\n// internals\n");
  780. VEC_EACH(&tctx->internals, i, t) {
  781. printf("[%s] = \"%s\",\n", t->name, t->name);
  782. }
  783. if(print_guards) printf("#endif // %s_ENUM_NAMES\n\n\n", guard_prefix);
  784. }
  785. if(print_enums) {
  786. if(print_guards) printf("#ifdef %s_ENUMS\n", guard_prefix);
  787. printf("// terminals\n");
  788. VEC_EACH(&tctx->terminals, i, t) {
  789. printf("%s,\n", t->name);
  790. }
  791. printf("\n// internals\n");
  792. VEC_EACH(&tctx->internals, i, t) {
  793. printf("%s,\n", t->name);
  794. }
  795. if(print_guards) printf("#endif // %s_ENUMS\n\n\n", guard_prefix);
  796. }
  797. if(print_data_defs) {
  798. if(print_guards) printf("#ifdef %s_TERMINAL_DATA_DEFS\n", guard_prefix);
  799. VEC_EACH(&tctx->terminals, i, t) {
  800. if(VEC_LEN(&t->terminal_data)) {
  801. printf("char* %s%s[] = {", data_def_prefix, t->name);
  802. VEC_EACH(&t->terminal_data, i2, dp) {
  803. char* k = dp.key ? dp.key : "";
  804. char* v = dp.val ? dp.val : "";
  805. printf("\"%s\", \"%s\", ", k, v); // TODO: string escaping
  806. }
  807. printf("NULL};\n");
  808. }
  809. }
  810. if(print_guards) printf("#endif // %s_TERMINAL_DATA_DEFS\n\n\n", guard_prefix);
  811. }
  812. if(print_data) {
  813. if(print_guards) printf("#ifdef %s_TERMINAL_DATA\n", guard_prefix);
  814. VEC_EACH(&tctx->terminals, i, t) {
  815. if(VEC_LEN(&t->terminal_data)) {
  816. printf("[%s] = %s%s,\n", t->name, data_def_prefix, t->name);
  817. }
  818. }
  819. if(print_guards) printf("#endif // %s_TERMINAL_DATA\n\n\n", guard_prefix);
  820. }
  821. if(print_unique_data) {
  822. if(print_guards) printf("#ifdef %s_UNIQUE_TERMINAL_PAIRS\n", guard_prefix);
  823. struct strtrip {
  824. char* sname, *key, *val;
  825. };
  826. HT(struct strpair*) table;
  827. HT_init(&table, VEC_LEN(&tctx->terminals) * 1.5);
  828. VEC_EACH(&tctx->terminals, i, t) {
  829. VEC_EACHP(&t->terminal_data, i2, tp) {
  830. char* k = malloc(strlen(tp->key) + 2 + strlen(tp->val));
  831. strcpy(k, tp->key);
  832. strcat(k, "=");
  833. strcat(k, tp->val);
  834. HT_set(&table, k, tp);
  835. free(k);
  836. // printf("[%s] = %s%s,\n", t->name, data_def_prefix, t->name);
  837. }
  838. }
  839. HT_EACH(&table, k, strpair*, tp) {
  840. printf("TDX(%s, %s)\n", tp->key, tp->val);
  841. }
  842. HT_destroy(&table);
  843. if(print_guards) printf("#endif // %s_UNIQUE_TERMINAL_PAIRS\n\n\n", guard_prefix);
  844. }
  845. if(print_triplet_data) {
  846. if(print_guards) printf("#ifdef %s_TERMINAL_TRIPLETS\n", guard_prefix);
  847. VEC_EACH(&tctx->terminals, i, t) {
  848. VEC_EACHP(&t->terminal_data, i2, tp) {
  849. printf("TDX(%s, %s, %s)\n", t->name, tp->key, tp->val);
  850. }
  851. }
  852. if(print_guards) printf("#endif // %s_TERMINAL_TRIPLETS\n\n\n", guard_prefix);
  853. }
  854. if(print_csets) {
  855. if(print_guards) printf("#ifdef %s_CSETS\n", guard_prefix);
  856. HT_EACH(&tctx->csets, key, charset*, cs) {
  857. printf("char cset_%s[] = {", cs->name);
  858. for(int i = 0; i < cs->minval; i++) printf("0,");
  859. for(int i = cs->minval; i <= cs->maxval; i++) printf("%d,", !!cs->table[i - cs->minval]);
  860. printf("0};\n");
  861. printf("int cset_%s_len = %d;\n", cs->name, cs->maxval);
  862. }
  863. if(print_guards) printf("#endif // %s_CSETS\n\n\n", guard_prefix);
  864. }
  865. if(print_switch) {
  866. if(print_guards) printf("#ifdef %s_SWITCH\n", guard_prefix);
  867. printf("\n// terminals\n");
  868. VEC_EACH(&tctx->terminals, i, si) {
  869. print_state_switch(si);
  870. }
  871. printf("\n// internals\n");
  872. VEC_EACH(&tctx->internals, i, si) {
  873. print_state_switch(si);
  874. }
  875. // HT_EACH(&tctx->states, key, state_info*, si) {
  876. // print_state_switch(si);
  877. //
  878. // }
  879. if(print_guards) printf("#endif // %s_SWITCH\n\n\n", guard_prefix);
  880. }
  881. return 0;
  882. }