lexer.rkt 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218
  1. #lang racket/base
  2. (provide
  3. (struct-out group)
  4. (struct-out literal)
  5. (struct-out sep)
  6. (struct-out id)
  7. (struct-out stx)
  8. lexer? port->lexer generator->lexer sequence->lexer
  9. lex lex-list
  10. lexer-literal-parsers BAD-PARSE)
  11. (require
  12. threading
  13. racket/match
  14. racket/format
  15. racket/string
  16. (only-in racket/generator generator yield)
  17. syntax/parse/define
  18. (for-syntax racket/base))
  19. ;; ==========================
  20. ;; LEXER AST
  21. ;; Lexer "tokens" (still a tree tho)
  22. (struct group [style tokens] #:transparent)
  23. (struct literal [value] #:transparent)
  24. (struct sep [str] #:transparent)
  25. (struct id [symbol] #:transparent)
  26. ;; Wrap additional metadata around token. This struct should be extended to attach new types of info
  27. ;; (e.g.: syntax properties, scope sets).
  28. (struct stx [token srcpos])
  29. ;; ==========================
  30. ;; LEXER CONFIG
  31. (define BAD-PARSE
  32. (let () (struct bad-parse []) (bad-parse)))
  33. ;; string -> (or exact-integer BAD-PARSE)
  34. (define (parse-integer s)
  35. (match (string->number s)
  36. [(? exact-integer? n) n]
  37. [_ BAD-PARSE]))
  38. ;; [listof [string -> (or value BAD-PARSE)]]
  39. (define lexer-literal-parsers
  40. (make-parameter
  41. (list parse-integer)))
  42. ;; ==========================
  43. ;; LEXING
  44. (struct lx:atom [token] #:transparent)
  45. (struct lx:open [char] #:transparent)
  46. (struct lx:close [char] #:transparent)
  47. (struct lx:eof [] #:transparent)
  48. ;; (reader port
  49. ;; #:whitespace regexp-string
  50. ;; [regexp-string capture-id re-match-body ...]
  51. ;; ...
  52. ;; [id ident-match-body ...]
  53. ;; [#:eof eof-body ...])
  54. ;;
  55. ;; Returns a zero-arg generator function that reads from 'port'.
  56. ;; The generator returns 're-match-body' if one of the regexp matches the next sequence in the port
  57. ;; (with capture-id bound to the string captured by the regexp).
  58. ;; The generator returns 'ident-match-body' if no regexp matches, where 'id' is bound to the string
  59. ;; up until that point.
  60. ;; The generator returns 'eof-body' when the end of file is encountered.
  61. (define-simple-macro (build-reader port-expr
  62. #:whitespace rx-ws-str-expr:expr
  63. [rx-str-expr:expr rx-cap:id rx-rhs ...+]
  64. ...
  65. [identifier:id ident-rhs ...+]
  66. [#:eof eof-rhs ...+])
  67. #:with [cap-range ...] (generate-temporaries #'[rx-cap ...])
  68. (let ([port port-expr]
  69. [rx (~> (list (string-append "(" rx-str-expr ")") ...) ;; Make each regexp a capture group
  70. (string-join _ "|") ;; Combine with |
  71. (format "(~a)|~a|$" rx-ws-str-expr _) ;; Match whitespace and EOF
  72. pregexp)])
  73. (generator ()
  74. (let loop ()
  75. (match-define
  76. ;; Bind "capture ranges" from running the regexp on the port
  77. (list (cons start end)
  78. ws-cap-range
  79. cap-range
  80. ...)
  81. (regexp-match-peek-positions rx port))
  82. (let* ([before (bytes->string/utf-8 (read-bytes start port))]
  83. [matched (bytes->string/utf-8 (read-bytes (- end start) port))])
  84. ;; If before is non-empty, then treat that as an identifier
  85. (unless (zero? start)
  86. (yield (let ([identifier before]) ident-rhs ...)))
  87. (cond [ws-cap-range
  88. ;; Ignore whitespace
  89. (loop)]
  90. [cap-range
  91. ;; Defer to appropriate rx-rhs if the corresponding cap-range is not #f.
  92. (yield (let ([rx-cap matched]) rx-rhs ...))
  93. (loop)]
  94. ...
  95. [else ;; No match so must be EOF
  96. eof-rhs ...]))))))
  97. ;; (lexer [-> lx:*])
  98. (struct lexer [reader])
  99. ;; Create a new lexer that parses from given port.
  100. ;; -> lexer
  101. (define (port->lexer port)
  102. (define lit-parsers (lexer-literal-parsers))
  103. (lexer
  104. (build-reader port
  105. #:whitespace "\\s+"
  106. ["[([{]" s (lx:open (string-ref s 0))]
  107. ["[)\\]}]" s (lx:close (string-ref s 0))]
  108. ["[,:;=|.]" s (lx:atom (sep s))]
  109. [ident
  110. (or (for*/first ([parse (in-list lit-parsers)]
  111. [v (in-value (parse ident))]
  112. #:when (not (eq? v BAD-PARSE)))
  113. (lx:atom (literal v)))
  114. (lx:atom (id ident)))]
  115. [#:eof
  116. (lx:eof)])))
  117. ;; Create a new lexer that uses the given generator.
  118. ;; [-> lx:*] -> lexer
  119. (define (generator->lexer gen)
  120. (lexer gen))
  121. ;; Create a new lexer that uses the given sequence.
  122. ;; [sequenceof lx*:] -> lexer
  123. (define (sequence->lexer seq)
  124. (lexer (generator ()
  125. (for ([x seq]) (yield x))
  126. (let loop ()
  127. (yield (lx:eof))
  128. (loop)))))
  129. ;; lexer char -> char
  130. (define (lexer-close-char _lx open-chr)
  131. (case open-chr
  132. [(#\() #\)]
  133. [(#\{) #\}]
  134. [(#\[) #\]]))
  135. ;; lexer -> (or lx:atom lx:open lx:close lx:eof)
  136. (define (lex lx)
  137. ((lexer-reader lx)))
  138. ;; lexer (or char 'eof) -> [listof token]
  139. (define (lex-list lx closing)
  140. (match (lex lx)
  141. [(lx:atom tok)
  142. (cons tok (lex-list lx closing))]
  143. [(lx:open chr)
  144. (define close-chr
  145. (lexer-close-char lx chr))
  146. (define tok
  147. (group chr (lex-list lx close-chr)))
  148. (cons tok (lex-list lx closing))]
  149. [(lx:close chr)
  150. (when (eq? closing 'eof)
  151. (error (~a "unexpected closing paren: '" chr "'")))
  152. (unless (eqv? chr closing)
  153. (error (~a "unexpected closing paren '" chr "', expected '" closing "'")))
  154. '()]
  155. [(lx:eof)
  156. (unless (eq? closing 'eof)
  157. (error (~a "did not find expected closing paren '" closing "'")))
  158. '()]))
  159. (module+ test
  160. (require
  161. rackunit)
  162. (let ([lx (port->lexer
  163. (open-input-string "1 hello: (a b) {a, [b=c]}"))])
  164. (check-equal?
  165. (lex-list lx 'eof)
  166. (list (literal 1)
  167. (id "hello")
  168. (sep ":")
  169. (group #\( (list (id "a") (id "b")))
  170. (group #\{ (list (id "a")
  171. (sep ",")
  172. (group #\[ (list (id "b")
  173. (sep "=")
  174. (id "c"))))))))
  175. (let ([lx (sequence->lexer
  176. ;; "1 (a)"
  177. (list (lx:atom (literal 1))
  178. (lx:open #\()
  179. (lx:atom (id "a"))
  180. (lx:close #\))))])
  181. (check-equal?
  182. (lex-list lx 'eof)
  183. (list (literal 1)
  184. (group #\( (list (id "a")))))))