lexer.c 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196
  1. #include <stdlib.h>
  2. #include <stdio.h>
  3. #include <string.h>
  4. #include <assert.h>
  5. #include <math.h>
  6. #include "../../../sti.h"
  7. #include "lexer.h"
  // Printable name for lexer states, indexed by the enum LexState value via
  // designated initializers. LST_INVALID and LST_MAXVALUE are spelled out here;
  // the entries in between are pulled in from the generated header, which is
  // included with PARSER_INCLUDE_ENUM_NAMES defined so that it emits only its
  // name-table section (presumably one `[LST_x] = "LST_x",` line per state --
  // confirm against the generator).
  8. char* state_names[] = {
  9. [LST_INVALID] = "LST_INVALID",
  10. #define PARSER_INCLUDE_ENUM_NAMES
  11. #include "./parser_example_generated.h"
  12. #undef PARSER_INCLUDE_ENUM_NAMES
  13. [LST_MAXVALUE] = "LST_MAXVALUE",
  14. };
  // First pass over the generated header: PARSER_INCLUDE_TERMINAL_DATA_DEFS
  // selects the definitions that the state_data table below refers to. This
  // include must stay ahead of the table.
  15. #define PARSER_INCLUDE_TERMINAL_DATA_DEFS
  16. #include "./parser_example_generated.h"
  17. #undef PARSER_INCLUDE_TERMINAL_DATA_DEFS
  // Table of `char**` entries whose initializers come entirely from the
  // generated header (PARSER_INCLUDE_TERMINAL_DATA section).
  // NOTE(review): presumably one entry per terminal lexer state, indexed like
  // state_names -- the layout is not visible from this file; confirm.
  18. char** state_data[] = {
  19. #define PARSER_INCLUDE_TERMINAL_DATA
  20. #include "./parser_example_generated.h"
  21. #undef PARSER_INCLUDE_TERMINAL_DATA
  22. };
  23. // this is for the incremental lexing of each token, not the whole stream
  // One instance is owned by the driver loop and handed to eatchar() for every
  // input character; the driver resets state/blen/tokenFinished after it has
  // copied a finished token out.
  24. struct lexer_state {
  // current state of the state machine; eatchar() switches on this
  25. enum LexState state;
  // text of the token collected so far; not NUL-terminated, blen bytes valid
  26. char* buffer;
  // number of bytes currently stored in buffer
  27. int blen;
  // number of bytes allocated for buffer
  28. int balloc;
  // line counter, bumped by eatchar() when it sees '\n'
  29. int linenum;
  // column counter, bumped by eatchar() and reset to 0 on '\n'
  30. int charnum;
  // state the machine was in when the token completed, i.e. the token's type
  31. enum LexState tokenState;
  // set to 1 by eatchar() when a token is complete; cleared by the driver
  32. int tokenFinished; // buffer should be consumed and cleaned at this point
  33. };
  34. static int eatchar(struct lexer_state* st, int c) {
  35. #define PARSER_INCLUDE_CSETS
  36. #include "./parser_example_generated.h"
  37. #undef PARSER_INCLUDE_CSETS
  38. #define push_char_id(_state) \
  39. do { \
  40. st->state = _state; \
  41. goto PUSH_CHAR_RET; \
  42. } while(0)
  43. #define discard_char_id(_state) \
  44. do { \
  45. st->state = _state; \
  46. return 1; \
  47. } while(0)
  48. #define retry_as(_state) \
  49. do { \
  50. st->state = _state; \
  51. goto RETRY; \
  52. } while(0);
  53. #define done_zero_move(_state) \
  54. do { \
  55. st->state = _state; \
  56. goto TOKEN_DONE; \
  57. } while(0);
  58. #define push_char_done(_state) \
  59. do { \
  60. st->state = _state; \
  61. goto PUSH_CHAR_DONE; \
  62. } while(0);
  63. #define charset_has(cs, c) (c <= cs##_len && !!cs[c])
  64. // hopefully this works
  65. st->charnum++;
  66. if(c == '\n') {
  67. st->linenum++;
  68. st->charnum = 0;
  69. }
  70. RETRY:
  71. switch(st->state) {
  72. #define PARSER_INCLUDE_SWITCH
  73. #include "./parser_example_generated.h"
  74. #undef PARSER_INCLUDE_SWITCH
  75. default:
  76. printf("Lexer reached default: %d\n", st->state);
  77. st->state = LST_NULL;
  78. return 0;
  79. }
  80. assert(0);
  81. // never gets here
  82. ERROR:
  83. //printf("Lexer error at line %d:%d: state %d(%s) %d='%c' \n", st->linenum, st->charnum, st->state, state_names[st->state], c, c);
  84. st->state = LST_NULL;
  85. st->blen = 0;
  86. return 1;
  87. TOKEN_DONE:
  88. st->tokenFinished = 1;
  89. st->tokenState = st->state;
  90. return 0;
  91. PUSH_CHAR_RET:
  92. st->buffer[st->blen] = c;
  93. st->blen++;
  94. return 1;
  95. PUSH_CHAR_DONE:
  96. st->buffer[st->blen] = c;
  97. st->blen++;
  98. st->tokenFinished = 1;
  99. st->tokenState = st->state;
  100. return 1;
  101. }
  102. TokenStream* LoadAndLexFile(char* path) {
  103. char* source = readWholeFile(path, NULL);
  104. TokenStream* ts = calloc(1, sizeof(*ts));
  105. struct lexer_state ls = {
  106. .state = 0,
  107. .balloc = 256,
  108. .blen = 0,
  109. .buffer = calloc(1, 256),
  110. .state = LST_NULL,
  111. .linenum = 0,
  112. .charnum = 0,
  113. .tokenState = 0,
  114. .tokenFinished = 0,
  115. };
  116. for(int i = 0; source[i];) {
  117. int ret;
  118. ret = eatchar(&ls, source[i]);
  119. if(ls.tokenFinished) {
  120. // token is ready
  121. VEC_INC(&ts->tokens);
  122. LexerToken* t = &VEC_TAIL(&ts->tokens);
  123. t->tokenState = ls.tokenState;
  124. t->tokenText = strndup(ls.buffer, ls.blen);
  125. t->line = ls.linenum;
  126. t->character = ls.charnum;
  127. t->sourceFile = NULL;
  128. // printf("got token: #%d (%s) '%.*s'\n", ls.tokenState, state_names[ls.tokenState], ls.blen, ls.buffer);
  129. // reset the lex state when done reading
  130. ls.tokenFinished = 0;
  131. ls.state = LST_NULL;
  132. ls.blen = 0;
  133. }
  134. if(ret) {
  135. i++; // advance on ret == 1
  136. }
  137. }
  138. // printf("last token: #%d (%s) '%.*s'\n", ls.tokenState, state_names[ls.tokenState], ls.blen, ls.buffer);
  139. free(source);
  140. return ts;
  141. }