lexer.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834
  1. #include <stdlib.h>
  2. #include <stdio.h> // only for debugging
  3. #include "lexer.h"
  4. #define A(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
  5. #define D(c) (('0' <= (c) && (c) <= '9'))
  6. #define AN(c) (A(c) || D(c))
  7. #define WS(c) ((c) == ' ' || (c) == '\t' || (c) == '\v')
  8. #define NL(c) ((c) == '\r' || (c) == '\n')
  9. #define ESCAPED_LB(found, invalid) \
  10. if(s[0] == '\\') { \
  11. if(s[1] == '\r') { \
  12. if(s[2] == '\n') s++; \
  13. s += 2; \
  14. line++; \
  15. col = 0; \
  16. found; \
  17. } \
  18. else if(s[1] == '\n') { \
  19. s += 2; \
  20. line++; \
  21. col = 0; \
  22. found; \
  23. } \
  24. \
  25. invalid; \
  26. }
  27. #define CHECK_BUF(t, n) \
  28. do { \
  29. if(n + 1 >= (t)->alloc) { \
  30. (t)->alloc *= 2; \
  31. (t)->text = realloc((t)->text, (t)->alloc * sizeof(*(t)->text)); \
  32. buf = (t)->text; \
  33. } \
  34. } while(0);
  35. int is_token(lexer_source_t* src, lexer_token_t* t) {
  36. t->has_newline = 0;
  37. if(src->head >= src->text + src->len - 1) {
  38. t->len = 0;
  39. return 0;
  40. }
  41. char* s = src->head;
  42. char* os = s;
  43. char* buf = t->text;
  44. int line = t->start_line;
  45. int col = t->start_col;
  46. int was_e = 0;
  47. RESTART:
  48. int n = 1;
  49. int c = s[0];
  50. buf[0] = c;
  51. switch(c) {
  52. case '\\':
  53. if(s[1] == '\r') {
  54. if(s[2] == '\n') s++;
  55. s += 2;
  56. line++;
  57. col = 0;
  58. goto RESTART;
  59. }
  60. else if(s[1] == '\n') {
  61. s += 2;
  62. line++;
  63. col = 0;
  64. goto RESTART;
  65. }
  66. // unicode escape sequences *in the code itself* are not supported. GTFO with that IOCCC shit.
  67. goto INVALID;
  68. case '\r':
  69. case '\n':
  70. case ' ':
  71. case '\t':
  72. case '\v':
  73. n = 0;
  74. while(1) {
  75. if(*s == ' ' || *s == '\t' || *s == '\v') {
  76. CHECK_BUF(t, n);
  77. buf[n] = *s;
  78. n++; s++; col++;
  79. continue;
  80. }
  81. if(*s == '\r') {
  82. CHECK_BUF(t, n);
  83. buf[n++] = *s;
  84. s++;
  85. if(*s == '\n') {
  86. CHECK_BUF(t, n);
  87. buf[n++] = *s;
  88. s++;
  89. }
  90. line++;
  91. col = 0;
  92. t->has_newline = 1;
  93. continue;
  94. }
  95. if(*s == '\n') {
  96. CHECK_BUF(t, n);
  97. buf[n++] = *s;
  98. s++;
  99. line++;
  100. col = 0;
  101. t->has_newline = 1;
  102. continue;
  103. }
  104. ESCAPED_LB(continue, );
  105. break;
  106. }
  107. t->type = LEXER_TOK_SPACE;
  108. goto RETURN;
  109. case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  110. case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  111. case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
  112. case 'v': case 'w': case 'x': case 'y': case 'z':
  113. case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  114. case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
  115. case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
  116. case 'V': case 'W': case 'X': case 'Y': case 'Z':
  117. case '_':
  118. goto IDENT;
  119. case '\'':
  120. goto CHARLIT;
  121. case '"':
  122. goto STRING;
  123. case '[': case ']':
  124. case '{': case '}':
  125. case '(': case ')':
  126. case '?': case ';':
  127. case '~': case ',':
  128. goto PUNCT1;
  129. case '*':
  130. if(s[1] == '=') goto PUNCT2;
  131. if(s[1] == '\\') goto SLOW;
  132. goto PUNCT1;
  133. case '#':
  134. if(s[1] == '#') goto PUNCT2;
  135. if(s[1] == '\\') goto SLOW;
  136. goto PUNCT1;
  137. case '!':
  138. if(s[1] == '=') goto PUNCT2;
  139. if(s[1] == '\\') goto SLOW;
  140. goto PUNCT1;
  141. case '^':
  142. if(s[1] == '=') goto PUNCT2;
  143. if(s[1] == '\\') goto SLOW;
  144. goto PUNCT1;
  145. case ':':
  146. if(s[1] == '>') goto PUNCT2;
  147. if(s[1] == '\\') goto SLOW;
  148. goto PUNCT1;
  149. case '=':
  150. if(s[1] == '=') goto PUNCT2;
  151. if(s[1] == '\\') goto SLOW;
  152. goto PUNCT1;
  153. case '+':
  154. if(s[1] == '=' || s[1] == '+') goto PUNCT2;
  155. if(s[1] == '\\') goto SLOW;
  156. goto PUNCT1;
  157. case '|':
  158. if(s[1] == '=' || s[1] == '|') goto PUNCT2;
  159. if(s[1] == '\\') goto SLOW;
  160. goto PUNCT1;
  161. case '&':
  162. if(s[1] == '=' || s[1] == '&') goto PUNCT2;
  163. if(s[1] == '\\') goto SLOW;
  164. goto PUNCT1;
  165. case '%':
  166. if(s[1] == ':') {
  167. if(s[2] == '%' && s[3] == ':') goto PUNCT4;
  168. if(s[2] == '%' && s[3] == '\\') goto SLOW;
  169. if(s[2] == '\\') goto SLOW;
  170. goto PUNCT2;
  171. }
  172. if(s[1] == '=' || s[1] == '>') goto PUNCT2;
  173. if(s[1] == '\\') goto SLOW;
  174. goto PUNCT1;
  175. case '-':
  176. if(s[1] == '=' || s[1] == '>' || s[1] == '-') goto PUNCT2;
  177. if(s[1] == '\\') goto SLOW;
  178. goto PUNCT1;
  179. case '/':
  180. if(s[1] == '/') goto SL_COMMENT;
  181. if(s[1] == '*') goto ML_COMMENT;
  182. if(s[1] == '=') goto PUNCT2;
  183. if(s[1] == '\\') goto SLOW;
  184. goto PUNCT1;
  185. case '>':
  186. if(s[1] == '=') goto PUNCT2;
  187. if(s[1] == '>') {
  188. if(s[2] == '=') goto PUNCT3;
  189. if(s[2] == '\\') goto SLOW;
  190. goto PUNCT2;
  191. }
  192. if(s[1] == '\\') goto SLOW;
  193. goto PUNCT1;
  194. case '<':
  195. if(s[1] == '=' || s[1] == ':' || s[1] == '%') goto PUNCT2;
  196. if(s[1] == '<') {
  197. if(s[2] == '=') goto PUNCT3;
  198. if(s[2] == '\\') goto SLOW;
  199. goto PUNCT2;
  200. }
  201. if(s[1] == '\\') goto SLOW;
  202. goto PUNCT1;
  203. case '.':
  204. if(s[1] == '.') {
  205. if(s[2] == '.') goto PUNCT3;
  206. if(s[2] == '\\') goto SLOW;
  207. goto PUNCT1; // only return the first '.'
  208. }
  209. if(s[1] == '\\') goto SLOW;
  210. goto NUMBER;
  211. case '0': case '1': case '2': case '3': case '4':
  212. case '5': case '6': case '7': case '8': case '9':
  213. goto NUMBER;
  214. default:
  215. goto INVALID;
  216. }
  217. IDENT:
  218. s++;
  219. while(1) {
  220. if(AN(*s) || *s == '_') {
  221. CHECK_BUF(t, n);
  222. buf[n] = *s;
  223. n++; s++; col++;
  224. continue;
  225. }
  226. ESCAPED_LB(continue, );
  227. break;
  228. }
  229. t->type = LEXER_TOK_IDENT;
  230. goto RETURN;
  231. NUMBER:
  232. s++;
  233. while(1) {
  234. ESCAPED_LB(continue, );
  235. if(was_e) {
  236. if(*s == '+' || *s == '-') {
  237. CHECK_BUF(t, n);
  238. buf[n] = *s;
  239. n++; s++; col++;
  240. continue;
  241. }
  242. was_e = 0;
  243. }
  244. if((n == 1 && D(*s)) || (n > 1 && AN(*s))) {
  245. CHECK_BUF(t, n);
  246. buf[n] = *s;
  247. n++; s++; col++;
  248. continue;
  249. }
  250. if(*s == 'e' || *s == 'E' || *s == 'p' || *s == 'P') {
  251. CHECK_BUF(t, n);
  252. buf[n++] = *s;
  253. s++; col++;
  254. was_e = 1;
  255. }
  256. break;
  257. }
  258. if(n == 1 && buf[0] == '.') {s--; goto PUNCT1; };
  259. t->type = LEXER_TOK_NUMBER;
  260. goto RETURN;
  261. SL_COMMENT:
  262. s++;
  263. while(1) {
  264. if(*s == '\r' || *s == '\n') {
  265. line++;
  266. break;
  267. }
  268. ESCAPED_LB(continue, );
  269. CHECK_BUF(t, n);
  270. buf[n++] = *s;
  271. s++; col++;
  272. }
  273. t->type = LEXER_TOK_COMMENT;
  274. goto RETURN;
  275. ML_COMMENT: {
  276. s++;
  277. int state = 0; // used to check for the * / sequence
  278. while(1) {
  279. ESCAPED_LB(continue, );
  280. if(state == 1) {
  281. if(*s == '/') {
  282. CHECK_BUF(t, n);
  283. buf[n++] = *s;
  284. s++; col++;
  285. break;
  286. }
  287. state = 0;
  288. }
  289. if(*s == '*') state = 1;
  290. if(*s == '\n') line++;
  291. CHECK_BUF(t, n);
  292. buf[n++] = *s;
  293. s++; col++;
  294. }
  295. t->type = LEXER_TOK_COMMENT;
  296. goto RETURN;
  297. }
  298. STRING: {
  299. int state = 0;
  300. s++;
  301. while(1) {
  302. if(state == 1) {
  303. state = 0;
  304. if(*s == '"') {
  305. CHECK_BUF(t, n);
  306. buf[n++] = *s;
  307. s++; col++;
  308. continue;
  309. }
  310. }
  311. if(*s == '"') break;
  312. ESCAPED_LB(continue, state = 1);
  313. CHECK_BUF(t, n);
  314. buf[n++] = *s;
  315. s++; col++;
  316. }
  317. CHECK_BUF(t, n);
  318. buf[n++] = *s;
  319. s++; col++;
  320. t->type = LEXER_TOK_STRING;
  321. goto RETURN;
  322. }
  323. CHARLIT: {
  324. s++;
  325. int state = 0;
  326. while(1) {
  327. if(state == 1) {
  328. state = 0;
  329. if(*s != 0) {
  330. CHECK_BUF(t, n);
  331. buf[n++] = *s;
  332. s++; col++;
  333. continue;
  334. }
  335. }
  336. if(*s == '\'') break;
  337. ESCAPED_LB(continue, state = 1);
  338. CHECK_BUF(t, n);
  339. buf[n++] = *s;
  340. s++; col++;
  341. }
  342. CHECK_BUF(t, n);
  343. buf[n++] = *s;
  344. s++; col++;
  345. t->type = LEXER_TOK_CONST;
  346. goto RETURN;
  347. }
  348. PUNCT4:
  349. buf[3] = s[3];
  350. buf[2] = s[2];
  351. buf[1] = s[1];
  352. n = 4;
  353. s += 3;
  354. goto PUNCT1;
  355. PUNCT3:
  356. buf[2] = s[2];
  357. buf[1] = s[1];
  358. n = 3;
  359. s += 2;
  360. goto PUNCT1;
  361. PUNCT2:
  362. buf[1] = s[1];
  363. n = 2;
  364. s += 1;
  365. goto PUNCT1;
  366. PUNCT1:
  367. s++;
  368. t->type = LEXER_TOK_PUNCT;
  369. goto RETURN;
  370. INVALID:
  371. t->type = LEXER_TOK_INVALID;
  372. s++;
  373. goto RETURN;
  374. SLOW:
  375. // Escaped linebreaks are rare but also a pain in the ass.
  376. // There's a dedicated "correct" function for processing them.
  377. // It only needs to handle the multicharacter punctuation.
  378. return is_token_slow(src, t);
  379. RETURN:
  380. t->end_line = line;
  381. t->end_col = col;
  382. t->len = n;
  383. src->head = s;
  384. return n;
  385. }
  386. // states
  387. enum {
  388. _NONE = 0,
  389. _STAR,
  390. _HASH,
  391. _BANG,
  392. _CARET,
  393. _COLON,
  394. _EQUAL,
  395. _PLUS,
  396. _PIPE,
  397. _AMP,
  398. _PCT,
  399. _PCT_COLON,
  400. _PCT_COLON_PCT,
  401. _MINUS,
  402. _SLASH,
  403. _GT,
  404. _GT_GT,
  405. _LT,
  406. _LT_LT,
  407. _DOT,
  408. _DOT_DOT,
  409. };
  410. int is_token_slow(lexer_source_t* src, lexer_token_t* t) {
  411. char* s = src->head;
  412. char* os = s;
  413. char* buf = t->text;
  414. int line = t->start_line;
  415. int col = t->start_col;
  416. int was_e = 0;
  417. int n = 0;
  418. int state = _NONE;
  419. printf("slow method \n");
  420. START:
  421. int c = s[0];
  422. if(*s == '\\') {
  423. printf("escape\n");
  424. if(s[1] == '\r') {
  425. if(s[2] == '\n') s++;
  426. s += 2;
  427. line++;
  428. col = 0;
  429. goto START;
  430. }
  431. else if(s[1] == '\n') {
  432. printf("enl\n");
  433. s += 2;
  434. line++;
  435. col = 0;
  436. goto START;
  437. }
  438. // unicode escape sequences *in the code itself* are not supported. GTFO with that IOCCC shit.
  439. goto INVALID;
  440. }
  441. // buf[n++] = *s;
  442. printf("c='%c', n=%d\n", *s, n);
  443. switch(state) {
  444. case _NONE:
  445. switch(*s) {
  446. case '*': state = _STAR; goto RESTART;
  447. case '#': state = _HASH; goto RESTART;
  448. case '!': state = _BANG; goto RESTART;
  449. case '^': state = _CARET; goto RESTART;
  450. case ':': state = _COLON; goto RESTART;
  451. case '=': state = _EQUAL; goto RESTART;
  452. case '+': state = _PLUS; goto RESTART;
  453. case '|': state = _PIPE; goto RESTART;
  454. case '&': state = _AMP; goto RESTART;
  455. case '%': state = _PCT; goto RESTART;
  456. case '-': state = _MINUS; goto RESTART;
  457. case '/': state = _SLASH; goto RESTART;
  458. case '>': state = _GT; goto RESTART;
  459. case '<': state = _LT; goto RESTART;
  460. case '.': state = _DOT; goto RESTART;
  461. }
  462. case _STAR:
  463. if(s[0] == '=') goto PUNCT2;
  464. goto PUNCT1;
  465. case _HASH:
  466. if(s[0] == '#') goto PUNCT2;
  467. goto PUNCT1;
  468. case _BANG:
  469. if(s[0] == '=') goto PUNCT2;
  470. goto PUNCT1;
  471. case _CARET:
  472. if(s[0] == '=') goto PUNCT2;
  473. goto PUNCT1;
  474. case _COLON:
  475. if(s[0] == '>') goto PUNCT2;
  476. goto PUNCT1;
  477. case _EQUAL:
  478. if(s[0] == '=') goto PUNCT2;
  479. goto PUNCT1;
  480. case _PLUS:
  481. if(s[0] == '=' || s[0] == '+') goto PUNCT2;
  482. goto PUNCT1;
  483. case _PIPE:
  484. if(s[0] == '=' || s[0] == '|') goto PUNCT2;
  485. goto PUNCT1;
  486. case _AMP:
  487. if(s[0] == '=' || s[0] == '&') goto PUNCT2;
  488. goto PUNCT1;
  489. case _PCT:
  490. if(s[0] == '=' || s[0] == '>') goto PUNCT2;
  491. if(s[0] == ':') { state = _PCT_COLON; goto RESTART; }
  492. goto PUNCT1;
  493. case _PCT_COLON:
  494. if(s[0] == '%') { state = _PCT_COLON_PCT; goto RESTART; }
  495. goto PUNCT2;
  496. case _PCT_COLON_PCT:
  497. if(s[0] == ':') goto PUNCT4;
  498. goto PUNCT2;
  499. case _MINUS:
  500. if(s[0] == '=' || s[0] == '>' || s[0] == '-') goto PUNCT2;
  501. goto PUNCT1;
  502. case _SLASH:
  503. if(s[0] == '/') goto SL_COMMENT;
  504. if(s[0] == '*') goto ML_COMMENT;
  505. if(s[0] == '=') goto PUNCT2;
  506. goto PUNCT1;
  507. case _GT:
  508. if(s[0] == '=') goto PUNCT2;
  509. if(s[0] == '>') { state = _GT_GT; goto RESTART; }
  510. goto PUNCT1;
  511. case _GT_GT:
  512. if(s[0] == '=') goto PUNCT3;
  513. goto PUNCT2;
  514. case _LT:
  515. if(s[0] == '=') goto PUNCT2;
  516. if(s[0] == '<') { state = _LT_LT; goto RESTART; }
  517. goto PUNCT1;
  518. case _LT_LT:
  519. if(s[0] == '=') goto PUNCT3;
  520. goto PUNCT2;
  521. case _DOT:
  522. printf("dot\n");
  523. if(s[0] == '.') { state = _DOT_DOT; goto RESTART; }
  524. goto NUMBER;
  525. case _DOT_DOT:
  526. printf("dot_dot\n");
  527. if(s[0] == '.') goto PUNCT3;
  528. goto PUNCT1;
  529. }
  530. RESTART:
  531. CHECK_BUF(t, n);
  532. buf[n++] = *s;
  533. s++; col++;
  534. goto START;
  535. PUNCT4:
  536. // buf[3] = s[3];
  537. // buf[2] = s[2];
  538. // buf[1] = s[1];
  539. // n = 4;
  540. goto PUNCT1;
  541. PUNCT3:
  542. // buf[2] = s[2];
  543. // buf[1] = s[1];
  544. // n = 3;
  545. goto PUNCT1;
  546. PUNCT2:
  547. // buf[1] = s[1];
  548. // n = 2;
  549. goto PUNCT1;
  550. PUNCT1:
  551. buf[n++] = *s;
  552. t->type = LEXER_TOK_PUNCT;
  553. goto RETURN;
  554. NUMBER:
  555. printf("num--------\n");
  556. while(1) {
  557. printf("%c n=%d\n", *s, n);
  558. ESCAPED_LB(continue, );
  559. if(was_e) {
  560. if(*s == '+' || *s == '-') {
  561. CHECK_BUF(t, n);
  562. buf[n] = *s;
  563. n++; s++; col++;
  564. continue;
  565. }
  566. was_e = 0;
  567. }
  568. if(n == 1 && !D(*s)) {
  569. t->type = LEXER_TOK_PUNCT;
  570. goto RETURN;
  571. }
  572. if(n > 1 && AN(*s)) {
  573. CHECK_BUF(t, n);
  574. buf[n] = *s;
  575. n++; s++; col++;
  576. continue;
  577. }
  578. if(*s == 'e' || *s == 'E' || *s == 'p' || *s == 'P') {
  579. CHECK_BUF(t, n);
  580. buf[n++] = *s;
  581. s++; col++;
  582. was_e = 1;
  583. }
  584. break;
  585. }
  586. if(n == 1 && buf[0] == '.') goto PUNCT1;
  587. t->type = LEXER_TOK_NUMBER;
  588. goto RETURN;
  589. SL_COMMENT:
  590. while(1) {
  591. if(*s == '\r' || *s == '\n') {
  592. break;
  593. }
  594. ESCAPED_LB(continue, );
  595. CHECK_BUF(t, n);
  596. buf[n++] = *s;
  597. s++; col++;
  598. }
  599. t->type = LEXER_TOK_COMMENT;
  600. goto RETURN;
  601. ML_COMMENT: {
  602. s++;
  603. int state = 0;
  604. while(1) {
  605. ESCAPED_LB(continue, );
  606. if(state == 1) {
  607. if(*s == '/') {
  608. CHECK_BUF(t, n);
  609. buf[n++] = *s;
  610. s++; col++;
  611. break;
  612. }
  613. state = 0;
  614. }
  615. if(*s == '*') state = 1;
  616. CHECK_BUF(t, n);
  617. buf[n++] = *s;
  618. s++; col++;
  619. }
  620. t->type = LEXER_TOK_COMMENT;
  621. goto RETURN;
  622. }
  623. INVALID:
  624. t->type = LEXER_TOK_INVALID;
  625. s++;
  626. goto RETURN;
  627. RETURN:
  628. t->end_line = line;
  629. t->end_col = col;
  630. t->len = n;
  631. src->head = s;
  632. return n;
  633. }
  634. char* lexer_token_type_names[] = {
  635. #define X(a) [LEXER_TOK_##a] = #a,
  636. LEXER_TOKEN_TYPE_LIST
  637. #undef X
  638. };