123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218 |
- #lang racket/base
- (provide
- (struct-out group)
- (struct-out literal)
- (struct-out sep)
- (struct-out id)
- (struct-out stx)
- lexer? port->lexer generator->lexer sequence->lexer
- lex lex-list
- lexer-literal-parsers BAD-PARSE)
- (require
- threading
- racket/match
- racket/format
- racket/string
- (only-in racket/generator generator yield)
- syntax/parse/define
- (for-syntax racket/base))
- ;; ==========================
- ;; LEXER AST
- ;; Lexer "tokens" (still a tree tho)
- (struct group [style tokens] #:transparent)
- (struct literal [value] #:transparent)
- (struct sep [str] #:transparent)
- (struct id [symbol] #:transparent)
- ;; Wrap additional metadata around token. This struct should be extended to attach new types of info
- ;; (e.g.: syntax properties, scope sets).
- (struct stx [token srcpos])
- ;; ==========================
- ;; LEXER CONFIG
- (define BAD-PARSE
- (let () (struct bad-parse []) (bad-parse)))
- ;; string -> (or exact-integer BAD-PARSE)
- (define (parse-integer s)
- (match (string->number s)
- [(? exact-integer? n) n]
- [_ BAD-PARSE]))
- ;; [listof [string -> (or value BAD-PARSE)]]
- (define lexer-literal-parsers
- (make-parameter
- (list parse-integer)))
- ;; ==========================
- ;; LEXING
- (struct lx:atom [token] #:transparent)
- (struct lx:open [char] #:transparent)
- (struct lx:close [char] #:transparent)
- (struct lx:eof [] #:transparent)
- ;; (reader port
- ;; #:whitespace regexp-string
- ;; [regexp-string capture-id re-match-body ...]
- ;; ...
- ;; [id ident-match-body ...]
- ;; [#:eof eof-body ...])
- ;;
- ;; Returns a zero-arg generator function that reads from 'port'.
- ;; The generator returns 're-match-body' if one of the regexp matches the next sequence in the port
- ;; (with capture-id bound to the string captured by the regexp).
- ;; The generator returns 'ident-match-body' if no regexp matches, where 'id' is bound to the string
- ;; up until that point.
- ;; The generator returns 'eof-body' when the end of file is encountered.
- (define-simple-macro (build-reader port-expr
- #:whitespace rx-ws-str-expr:expr
- [rx-str-expr:expr rx-cap:id rx-rhs ...+]
- ...
- [identifier:id ident-rhs ...+]
- [#:eof eof-rhs ...+])
- #:with [cap-range ...] (generate-temporaries #'[rx-cap ...])
- (let ([port port-expr]
- [rx (~> (list (string-append "(" rx-str-expr ")") ...) ;; Make each regexp a capture group
- (string-join _ "|") ;; Combine with |
- (format "(~a)|~a|$" rx-ws-str-expr _) ;; Match whitespace and EOF
- pregexp)])
- (generator ()
- (let loop ()
- (match-define
- ;; Bind "capture ranges" from running the regexp on the port
- (list (cons start end)
- ws-cap-range
- cap-range
- ...)
- (regexp-match-peek-positions rx port))
- (let* ([before (bytes->string/utf-8 (read-bytes start port))]
- [matched (bytes->string/utf-8 (read-bytes (- end start) port))])
- ;; If before is non-empty, then treat that as an identifier
- (unless (zero? start)
- (yield (let ([identifier before]) ident-rhs ...)))
- (cond [ws-cap-range
- ;; Ignore whitespace
- (loop)]
- [cap-range
- ;; Defer to appropriate rx-rhs if the corresponding cap-range is not #f.
- (yield (let ([rx-cap matched]) rx-rhs ...))
- (loop)]
- ...
- [else ;; No match so must be EOF
- eof-rhs ...]))))))
- ;; (lexer [-> lx:*])
- (struct lexer [reader])
- ;; Create a new lexer that parses from given port.
- ;; -> lexer
- (define (port->lexer port)
- (define lit-parsers (lexer-literal-parsers))
- (lexer
- (build-reader port
- #:whitespace "\\s+"
- ["[([{]" s (lx:open (string-ref s 0))]
- ["[)\\]}]" s (lx:close (string-ref s 0))]
- ["[,:;=|.]" s (lx:atom (sep s))]
- [ident
- (or (for*/first ([parse (in-list lit-parsers)]
- [v (in-value (parse ident))]
- #:when (not (eq? v BAD-PARSE)))
- (lx:atom (literal v)))
- (lx:atom (id ident)))]
- [#:eof
- (lx:eof)])))
- ;; Create a new lexer that uses the given generator.
- ;; [-> lx:*] -> lexer
- (define (generator->lexer gen)
- (lexer gen))
- ;; Create a new lexer that uses the given sequence.
- ;; [sequenceof lx*:] -> lexer
- (define (sequence->lexer seq)
- (lexer (generator ()
- (for ([x seq]) (yield x))
- (let loop ()
- (yield (lx:eof))
- (loop)))))
- ;; lexer char -> char
- (define (lexer-close-char _lx open-chr)
- (case open-chr
- [(#\() #\)]
- [(#\{) #\}]
- [(#\[) #\]]))
- ;; lexer -> (or lx:atom lx:open lx:close lx:eof)
- (define (lex lx)
- ((lexer-reader lx)))
- ;; lexer (or char 'eof) -> [listof token]
- (define (lex-list lx closing)
- (match (lex lx)
- [(lx:atom tok)
- (cons tok (lex-list lx closing))]
- [(lx:open chr)
- (define close-chr
- (lexer-close-char lx chr))
- (define tok
- (group chr (lex-list lx close-chr)))
- (cons tok (lex-list lx closing))]
- [(lx:close chr)
- (when (eq? closing 'eof)
- (error (~a "unexpected closing paren: '" chr "'")))
- (unless (eqv? chr closing)
- (error (~a "unexpected closing paren '" chr "', expected '" closing "'")))
- '()]
- [(lx:eof)
- (unless (eq? closing 'eof)
- (error (~a "did not find expected closing paren '" closing "'")))
- '()]))
- (module+ test
- (require
- rackunit)
- (let ([lx (port->lexer
- (open-input-string "1 hello: (a b) {a, [b=c]}"))])
- (check-equal?
- (lex-list lx 'eof)
- (list (literal 1)
- (id "hello")
- (sep ":")
- (group #\( (list (id "a") (id "b")))
- (group #\{ (list (id "a")
- (sep ",")
- (group #\[ (list (id "b")
- (sep "=")
- (id "c"))))))))
- (let ([lx (sequence->lexer
- ;; "1 (a)"
- (list (lx:atom (literal 1))
- (lx:open #\()
- (lx:atom (id "a"))
- (lx:close #\))))])
- (check-equal?
- (lex-list lx 'eof)
- (list (literal 1)
- (group #\( (list (id "a")))))))
|