tokenize.scm 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514
  1. ;;; ECMAScript for Guile
  2. ;; Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
  3. ;;;; This library is free software; you can redistribute it and/or
  4. ;;;; modify it under the terms of the GNU Lesser General Public
  5. ;;;; License as published by the Free Software Foundation; either
  6. ;;;; version 3 of the License, or (at your option) any later version.
  7. ;;;;
  8. ;;;; This library is distributed in the hope that it will be useful,
  9. ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. ;;;; Lesser General Public License for more details.
  12. ;;;;
  13. ;;;; You should have received a copy of the GNU Lesser General Public
  14. ;;;; License along with this library; if not, write to the Free Software
  15. ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  16. ;;; Code:
  17. (define-module (language ecmascript tokenize)
  18. #:use-module (ice-9 rdelim)
  19. #:use-module ((srfi srfi-1) #:select (unfold-right))
  20. #:use-module (system base lalr)
  21. #:export (next-token make-tokenizer make-tokenizer/1 tokenize tokenize/1))
  22. (define (syntax-error what loc form . args)
  23. (throw 'syntax-error #f what
  24. (and=> loc source-location->source-properties)
  25. form #f args))
  26. (define (port-source-location port)
  27. (make-source-location (port-filename port)
  28. (port-line port)
  29. (port-column port)
  30. (false-if-exception (ftell port))
  31. #f))
  32. ;; taken from SSAX, sorta
  33. (define (read-until delims port loc)
  34. (if (eof-object? (peek-char port))
  35. (syntax-error "EOF while reading a token" loc #f)
  36. (let ((token (read-delimited delims port 'peek)))
  37. (if (eof-object? (peek-char port))
  38. (syntax-error "EOF while reading a token" loc token)
  39. token))))
  40. (define (char-hex? c)
  41. (and (not (eof-object? c))
  42. (or (char-numeric? c)
  43. (memv c '(#\a #\b #\c #\d #\e #\f))
  44. (memv c '(#\A #\B #\C #\D #\E #\F)))))
  45. (define (digit->number c)
  46. (- (char->integer c) (char->integer #\0)))
  47. (define (hex->number c)
  48. (if (char-numeric? c)
  49. (digit->number c)
  50. (+ 10 (- (char->integer (char-downcase c)) (char->integer #\a)))))
  51. (define (read-slash port loc div?)
  52. (let ((c1 (begin
  53. (read-char port)
  54. (peek-char port))))
  55. (cond
  56. ((eof-object? c1)
  57. ;; hmm. error if we're not looking for a div? ?
  58. (make-lexical-token '/ loc #f))
  59. ((char=? c1 #\/)
  60. (read-line port)
  61. (next-token port div?))
  62. ((char=? c1 #\*)
  63. (read-char port)
  64. (let lp ((c (read-char port)))
  65. (cond
  66. ((eof-object? c)
  67. (syntax-error "EOF while in multi-line comment" loc #f))
  68. ((char=? c #\*)
  69. (if (eqv? (peek-char port) #\/)
  70. (begin
  71. (read-char port)
  72. (next-token port div?))
  73. (lp (read-char port))))
  74. (else
  75. (lp (read-char port))))))
  76. (div?
  77. (case c1
  78. ((#\=) (read-char port) (make-lexical-token '/= loc #f))
  79. (else (make-lexical-token '/ loc #f))))
  80. (else
  81. (read-regexp port loc)))))
  82. (define (read-regexp port loc)
  83. ;; first slash already read
  84. (let ((terms (string #\/ #\\ #\nl #\cr)))
  85. (let lp ((str (read-until terms port loc)) (head ""))
  86. (let ((terminator (peek-char port)))
  87. (cond
  88. ((char=? terminator #\/)
  89. (read-char port)
  90. ;; flags
  91. (let lp ((c (peek-char port)) (flags '()))
  92. (if (or (eof-object? c)
  93. (not (or (char-alphabetic? c)
  94. (char-numeric? c)
  95. (char=? c #\$)
  96. (char=? c #\_))))
  97. (make-lexical-token 'RegexpLiteral loc
  98. (cons (string-append head str)
  99. (reverse flags)))
  100. (begin (read-char port)
  101. (lp (peek-char port) (cons c flags))))))
  102. ((char=? terminator #\\)
  103. (read-char port)
  104. (let ((echar (read-char port)))
  105. (lp (read-until terms port loc)
  106. (string-append head str (string #\\ echar)))))
  107. (else
  108. (syntax-error "regexp literals may not contain newlines"
  109. loc str)))))))
  110. (define (read-string port loc)
  111. (let ((c (read-char port)))
  112. (let ((terms (string c #\\ #\nl #\cr)))
  113. (define (read-escape port)
  114. (let ((c (read-char port)))
  115. (case c
  116. ((#\' #\" #\\) c)
  117. ((#\b) #\bs)
  118. ((#\f) #\np)
  119. ((#\n) #\nl)
  120. ((#\r) #\cr)
  121. ((#\t) #\tab)
  122. ((#\v) #\vt)
  123. ((#\0)
  124. (let ((next (peek-char port)))
  125. (cond
  126. ((eof-object? next) #\nul)
  127. ((char-numeric? next)
  128. (syntax-error "octal escape sequences are not supported"
  129. loc #f))
  130. (else #\nul))))
  131. ((#\x)
  132. (let* ((a (read-char port))
  133. (b (read-char port)))
  134. (cond
  135. ((and (char-hex? a) (char-hex? b))
  136. (integer->char (+ (* 16 (hex->number a)) (hex->number b))))
  137. (else
  138. (syntax-error "bad hex character escape" loc (string a b))))))
  139. ((#\u)
  140. (let* ((a (read-char port))
  141. (b (read-char port))
  142. (c (read-char port))
  143. (d (read-char port)))
  144. (integer->char (string->number (string a b c d) 16))))
  145. (else
  146. c))))
  147. (let lp ((str (read-until terms port loc)))
  148. (let ((terminator (peek-char port)))
  149. (cond
  150. ((char=? terminator c)
  151. (read-char port)
  152. (make-lexical-token 'StringLiteral loc str))
  153. ((char=? terminator #\\)
  154. (read-char port)
  155. (let ((echar (read-escape port)))
  156. (lp (string-append str (string echar)
  157. (read-until terms port loc)))))
  158. (else
  159. (syntax-error "string literals may not contain newlines"
  160. loc str))))))))
  161. (define *keywords*
  162. '(("break" . break)
  163. ("else" . else)
  164. ("new" . new)
  165. ("var" . var)
  166. ("case" . case)
  167. ("finally" . finally)
  168. ("return" . return)
  169. ("void" . void)
  170. ("catch" . catch)
  171. ("for" . for)
  172. ("switch" . switch)
  173. ("while" . while)
  174. ("continue" . continue)
  175. ("function" . function)
  176. ("this" . this)
  177. ("with" . with)
  178. ("default" . default)
  179. ("if" . if)
  180. ("throw" . throw)
  181. ("delete" . delete)
  182. ("in" . in)
  183. ("try" . try)
  184. ("do" . do)
  185. ("instanceof" . instanceof)
  186. ("typeof" . typeof)
  187. ;; these aren't exactly keywords, but hey
  188. ("null" . null)
  189. ("true" . true)
  190. ("false" . false)))
  191. (define *future-reserved-words*
  192. '(("abstract" . abstract)
  193. ("enum" . enum)
  194. ("int" . int)
  195. ("short" . short)
  196. ("boolean" . boolean)
  197. ("export" . export)
  198. ("interface" . interface)
  199. ("static" . static)
  200. ("byte" . byte)
  201. ("extends" . extends)
  202. ("long" . long)
  203. ("super" . super)
  204. ("char" . char)
  205. ("final" . final)
  206. ("native" . native)
  207. ("synchronized" . synchronized)
  208. ("class" . class)
  209. ("float" . float)
  210. ("package" . package)
  211. ("throws" . throws)
  212. ("const" . const)
  213. ("goto" . goto)
  214. ("private" . private)
  215. ("transient" . transient)
  216. ("debugger" . debugger)
  217. ("implements" . implements)
  218. ("protected" . protected)
  219. ("volatile" . volatile)
  220. ("double" . double)
  221. ("import" . import)
  222. ("public" . public)))
  223. (define (read-identifier port loc)
  224. (let lp ((c (peek-char port)) (chars '()))
  225. (if (or (eof-object? c)
  226. (not (or (char-alphabetic? c)
  227. (char-numeric? c)
  228. (char=? c #\$)
  229. (char=? c #\_))))
  230. (let ((word (list->string (reverse chars))))
  231. (cond ((assoc-ref *keywords* word)
  232. => (lambda (x) (make-lexical-token x loc #f)))
  233. ((assoc-ref *future-reserved-words* word)
  234. (syntax-error "word is reserved for the future, dude."
  235. loc word))
  236. (else (make-lexical-token 'Identifier loc
  237. (string->symbol word)))))
  238. (begin (read-char port)
  239. (lp (peek-char port) (cons c chars))))))
  240. (define (read-numeric port loc)
  241. (let* ((c0 (if (char=? (peek-char port) #\.)
  242. #\0
  243. (read-char port)))
  244. (c1 (peek-char port)))
  245. (cond
  246. ((eof-object? c1) (digit->number c0))
  247. ((and (char=? c0 #\0) (or (char=? c1 #\x) (char=? c1 #\X)))
  248. (read-char port)
  249. (let ((c (peek-char port)))
  250. (if (not (char-hex? c))
  251. (syntax-error "bad digit reading hexadecimal number"
  252. loc c))
  253. (let lp ((c c) (acc 0))
  254. (cond ((char-hex? c)
  255. (read-char port)
  256. (lp (peek-char port)
  257. (+ (* 16 acc) (hex->number c))))
  258. (else
  259. acc)))))
  260. ((and (char=? c0 #\0) (char-numeric? c1))
  261. (let lp ((c c1) (acc 0))
  262. (cond ((eof-object? c) acc)
  263. ((char-numeric? c)
  264. (if (or (char=? c #\8) (char=? c #\9))
  265. (syntax-error "invalid digit in octal sequence"
  266. loc c))
  267. (read-char port)
  268. (lp (peek-char port)
  269. (+ (* 8 acc) (digit->number c))))
  270. (else
  271. acc))))
  272. (else
  273. (let lp ((c1 c1) (acc (digit->number c0)))
  274. (cond
  275. ((eof-object? c1) acc)
  276. ((char-numeric? c1)
  277. (read-char port)
  278. (lp (peek-char port)
  279. (+ (* 10 acc) (digit->number c1))))
  280. ((or (char=? c1 #\e) (char=? c1 #\E))
  281. (read-char port)
  282. (let ((add (let ((c (peek-char port)))
  283. (cond ((eof-object? c)
  284. (syntax-error "error reading exponent: EOF"
  285. loc #f))
  286. ((char=? c #\+) (read-char port) +)
  287. ((char=? c #\-) (read-char port) -)
  288. ((char-numeric? c) +)
  289. (else
  290. (syntax-error "error reading exponent: non-digit"
  291. loc c))))))
  292. (let lp ((c (peek-char port)) (e 0))
  293. (cond ((and (not (eof-object? c)) (char-numeric? c))
  294. (read-char port)
  295. (lp (peek-char port) (add (* 10 e) (digit->number c))))
  296. (else
  297. (* (if (negative? e) (* acc 1.0) acc) (expt 10 e)))))))
  298. ((char=? c1 #\.)
  299. (read-char port)
  300. (let lp2 ((c (peek-char port)) (dec 0.0) (n -1))
  301. (cond ((and (not (eof-object? c)) (char-numeric? c))
  302. (read-char port)
  303. (lp2 (peek-char port)
  304. (+ dec (* (digit->number c) (expt 10 n)))
  305. (1- n)))
  306. (else
  307. ;; loop back to catch an exponential part
  308. (lp c (+ acc dec))))))
  309. (else
  310. acc)))))))
  311. (define *punctuation*
  312. '(("{" . lbrace)
  313. ("}" . rbrace)
  314. ("(" . lparen)
  315. (")" . rparen)
  316. ("[" . lbracket)
  317. ("]" . rbracket)
  318. ("." . dot)
  319. (";" . semicolon)
  320. ("," . comma)
  321. ("<" . <)
  322. (">" . >)
  323. ("<=" . <=)
  324. (">=" . >=)
  325. ("==" . ==)
  326. ("!=" . !=)
  327. ("===" . ===)
  328. ("!==" . !==)
  329. ("+" . +)
  330. ("-" . -)
  331. ("*" . *)
  332. ("%" . %)
  333. ("++" . ++)
  334. ("--" . --)
  335. ("<<" . <<)
  336. (">>" . >>)
  337. (">>>" . >>>)
  338. ("&" . &)
  339. ("|" . bor)
  340. ("^" . ^)
  341. ("!" . !)
  342. ("~" . ~)
  343. ("&&" . &&)
  344. ("||" . or)
  345. ("?" . ?)
  346. (":" . colon)
  347. ("=" . =)
  348. ("+=" . +=)
  349. ("-=" . -=)
  350. ("*=" . *=)
  351. ("%=" . %=)
  352. ("<<=" . <<=)
  353. (">>=" . >>=)
  354. (">>>=" . >>>=)
  355. ("&=" . &=)
  356. ("|=" . bor=)
  357. ("^=" . ^=)))
  358. (define *div-punctuation*
  359. '(("/" . /)
  360. ("/=" . /=)))
  361. ;; node ::= (char (symbol | #f) node*)
  362. (define read-punctuation
  363. (let ((punc-tree (let lp ((nodes '()) (puncs *punctuation*))
  364. (cond ((null? puncs)
  365. nodes)
  366. ((assv-ref nodes (string-ref (caar puncs) 0))
  367. => (lambda (node-tail)
  368. (if (= (string-length (caar puncs)) 1)
  369. (set-car! node-tail (cdar puncs))
  370. (set-cdr! node-tail
  371. (lp (cdr node-tail)
  372. `((,(substring (caar puncs) 1)
  373. . ,(cdar puncs))))))
  374. (lp nodes (cdr puncs))))
  375. (else
  376. (lp (cons (list (string-ref (caar puncs) 0) #f) nodes)
  377. puncs))))))
  378. (lambda (port loc)
  379. (let lp ((c (peek-char port)) (tree punc-tree) (candidate #f))
  380. (cond
  381. ((assv-ref tree c)
  382. => (lambda (node-tail)
  383. (read-char port)
  384. (lp (peek-char port) (cdr node-tail) (car node-tail))))
  385. (candidate
  386. (make-lexical-token candidate loc #f))
  387. (else
  388. (syntax-error "bad syntax: character not allowed" loc c)))))))
  389. (define (next-token port div?)
  390. (let ((c (peek-char port))
  391. (loc (port-source-location port)))
  392. (case c
  393. ((#\ht #\vt #\np #\space #\x00A0) ; whitespace
  394. (read-char port)
  395. (next-token port div?))
  396. ((#\newline #\cr) ; line break
  397. (read-char port)
  398. (next-token port div?))
  399. ((#\/)
  400. ;; division, single comment, double comment, or regexp
  401. (read-slash port loc div?))
  402. ((#\" #\') ; string literal
  403. (read-string port loc))
  404. (else
  405. (cond
  406. ((eof-object? c)
  407. '*eoi*)
  408. ((or (char-alphabetic? c)
  409. (char=? c #\$)
  410. (char=? c #\_))
  411. ;; reserved word or identifier
  412. (read-identifier port loc))
  413. ((char-numeric? c)
  414. ;; numeric -- also accept . FIXME, requires lookahead
  415. (make-lexical-token 'NumericLiteral loc (read-numeric port loc)))
  416. (else
  417. ;; punctuation
  418. (read-punctuation port loc)))))))
  419. (define (make-tokenizer port)
  420. (let ((div? #f))
  421. (lambda ()
  422. (let ((tok (next-token port div?)))
  423. (set! div? (and (lexical-token? tok)
  424. (let ((cat (lexical-token-category tok)))
  425. (or (eq? cat 'Identifier)
  426. (eq? cat 'NumericLiteral)
  427. (eq? cat 'StringLiteral)))))
  428. tok))))
  429. (define (make-tokenizer/1 port)
  430. (let ((div? #f)
  431. (eoi? #f)
  432. (stack '()))
  433. (lambda ()
  434. (if eoi?
  435. '*eoi*
  436. (let ((tok (next-token port div?)))
  437. (case (if (lexical-token? tok) (lexical-token-category tok) tok)
  438. ((lparen)
  439. (set! stack (cons tok stack)))
  440. ((rparen)
  441. (if (and (pair? stack)
  442. (eq? (lexical-token-category (car stack)) 'lparen))
  443. (set! stack (cdr stack))
  444. (syntax-error "unexpected right parenthesis"
  445. (lexical-token-source tok)
  446. #f)))
  447. ((lbracket)
  448. (set! stack (cons tok stack)))
  449. ((rbracket)
  450. (if (and (pair? stack)
  451. (eq? (lexical-token-category (car stack)) 'lbracket))
  452. (set! stack (cdr stack))
  453. (syntax-error "unexpected right bracket"
  454. (lexical-token-source tok)
  455. #f)))
  456. ((lbrace)
  457. (set! stack (cons tok stack)))
  458. ((rbrace)
  459. (if (and (pair? stack)
  460. (eq? (lexical-token-category (car stack)) 'lbrace))
  461. (set! stack (cdr stack))
  462. (syntax-error "unexpected right brace"
  463. (lexical-token-source tok)
  464. #f)))
  465. ((semicolon)
  466. (set! eoi? (null? stack))))
  467. (set! div? (and (lexical-token? tok)
  468. (let ((cat (lexical-token-category tok)))
  469. (or (eq? cat 'Identifier)
  470. (eq? cat 'NumericLiteral)
  471. (eq? cat 'StringLiteral)))))
  472. tok)))))
  473. (define (tokenize port)
  474. (let ((next (make-tokenizer port)))
  475. (let lp ((out '()))
  476. (let ((tok (next)))
  477. (if (eq? tok '*eoi*)
  478. (reverse! out)
  479. (lp (cons tok out)))))))
  480. (define (tokenize/1 port)
  481. (let ((next (make-tokenizer/1 port)))
  482. (let lp ((out '()))
  483. (let ((tok (next)))
  484. (if (eq? tok '*eoi*)
  485. (reverse! out)
  486. (lp (cons tok out)))))))