123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173 |
- #
- #
- # The Nim Compiler
- # (c) Copyright 2012 Andreas Rumpf
- #
- # See the file "copying.txt", included in this
- # distribution, for details about the copyright.
- #
- # Base Object of a lexer with efficient buffer handling. In fact
- # I believe that this is the most efficient method of buffer
- # handling that exists! Only at line endings checks are necessary
- # if the buffer needs refilling.
- import
- llstream, strutils
const
  # Named character constants used while scanning.
  Lrz* = ' '          # blank / space
  Apo* = '\''         # apostrophe
  Tabulator* = '\t'   # horizontal tab (0x09)
  ESC* = '\e'         # escape (0x1B)
  CR* = '\c'          # carriage return (0x0D)
  FF* = '\f'          # form feed (0x0C)
  LF* = '\L'          # line feed (0x0A)
  BEL* = '\a'         # bell / alert (0x07)
  BACKSPACE* = '\b'   # backspace (0x08)
  VT* = '\v'          # vertical tab (0x0B)
const
  # Character stored in the buffer to mark the end of the input.
  EndOfFile* = '\0'

  # Buffer layout at a glance:
  #
  #   buf:
  #   "Example Text\n ha!"     bufLen = 17
  #    ^ pos = 0    ^ sentinel = 12
  #
  # The sentinel index is where the buffer gets refilled (see
  # `fillBaseLexer`); `fillBuffer` places it on a newline or on the
  # `EndOfFile` marker.

  # Characters that terminate a line.
  NewLines* = {CR, LF}
type
  # Base object for lexers; holds the character buffer and the
  # bookkeeping needed to refill it from a stream.
  TBaseLexer* = object of RootObj
    bufpos*: int            # current position inside `buf`
    buf*: cstring           # the character buffer itself
    bufLen*: int            # size of `buf`, counted in characters
    stream*: PLLStream      # stream the buffer is (re)filled from
    lineNumber*: int        # number of the line currently being read
    # Bookkeeping maintained by the procs of this module:
    sentinel*: int          # index in `buf` at which a refill happens
    lineStart*: int         # index in `buf` where the last line started
    offsetBase*: int        # ``offsetBase + bufpos`` yields the offset
proc openBaseLexer*(L: var TBaseLexer,
                    inputstream: PLLStream,
                    bufLen: int = 8192)
  # Prepares `L` for reading from `inputstream`. The default buffer
  # size of 8K characters is a reasonable choice.

proc closeBaseLexer*(L: var TBaseLexer)

proc getCurrentLine*(L: TBaseLexer, marker: bool = true): string

proc getColNumber*(L: TBaseLexer, pos: int): int

proc handleCR*(L: var TBaseLexer, pos: int): int
  # To be called after scanning over a CR in the buffer. `pos` must be
  # the position of that CR; the result is the position at which
  # scanning has to continue.

proc handleLF*(L: var TBaseLexer, pos: int): int
  # To be called after scanning over a LF in the buffer. `pos` must be
  # the position of that LF; the result is the position at which
  # scanning has to continue.
- # implementation
const
  # Size of one buffer character in bytes; converts between character
  # counts and the byte counts expected by the memory/stream routines.
  chrSize = sizeof(char)
proc closeBaseLexer(L: var TBaseLexer) =
  # Frees the lexer's character buffer and closes its input stream.
  #
  # The buffer is only released when one is present, and the pointer is
  # reset to nil afterwards so that `L.buf` never refers to memory that
  # has already been handed back to the allocator.
  if not isNil(L.buf):
    dealloc(L.buf)
    L.buf = nil
  llStreamClose(L.stream)
proc fillBuffer(L: var TBaseLexer) =
  # Refills the buffer from the stream and recomputes `L.sentinel`.
  #
  # Everything behind the old sentinel is moved to the front of the
  # buffer and the freed space is filled from the stream. The new
  # sentinel is either the `EndOfFile` marker written after a short read
  # or the last newline character found in the buffer. If the buffer
  # contains no newline at all, its size is doubled until one shows up
  # or the stream runs dry.
  #
  # All counts are in characters, not bytes (in case that differs).
  #
  # On entry the read position equals `L.sentinel`, except for the very
  # first call made while the lexer is being opened.
  assert(L.sentinel < L.bufLen)
  let keep = L.bufLen - L.sentinel - 1  # unread chars behind the sentinel
  assert(keep >= 0)
  if keep > 0:
    # source and destination may overlap; `moveMem` copes with that
    moveMem(L.buf, addr(L.buf[L.sentinel + 1]), keep * chrSize)
  let wanted = L.sentinel + 1           # free space after the move
  var got = llStreamRead(L.stream, addr(L.buf[keep]),
                         wanted * chrSize) div chrSize
  var last = keep + got
  if got < wanted:
    # short read: put the end marker right behind the data
    L.buf[last] = EndOfFile
    L.sentinel = last
    return
  # The buffer is completely full; look backwards for a newline.
  dec(last)                             # index of the last valid char
  while true:
    assert(last < L.bufLen)
    while (last >= 0) and (L.buf[last] notin NewLines): dec(last)
    if last >= 0:
      # found a suitable character for the sentinel
      L.sentinel = last
      break
    # No newline in the whole buffer: instead of giving up on the
    # overlong line, double the buffer and read the second half.
    let prevLen = L.bufLen
    L.bufLen = L.bufLen * 2
    L.buf = cast[cstring](realloc(L.buf, L.bufLen * chrSize))
    assert(L.bufLen - prevLen == prevLen)
    got = llStreamRead(L.stream, addr(L.buf[prevLen]),
                       prevLen * chrSize) div chrSize
    if got < prevLen:
      L.buf[prevLen + got] = EndOfFile
      L.sentinel = prevLen + got
      break
    last = L.bufLen - 1
proc fillBaseLexer(L: var TBaseLexer, pos: int): int =
  # Steps past the character at `pos` and returns the position to
  # continue from. When `pos` is the sentinel the buffer is refilled
  # first and reading restarts at index 0. In both cases the returned
  # position is recorded as the start of the new line.
  assert(pos <= L.sentinel)
  if pos >= L.sentinel:
    fillBuffer(L)
    # the consumed part of the buffer now counts towards the offset
    inc(L.offsetBase, pos + 1)
    L.bufpos = 0
    result = 0
  else:
    result = pos + 1          # still inside the buffer, no refill
  L.lineStart = result
proc handleCR(L: var TBaseLexer, pos: int): int =
  # Handles the CR located at `pos`: bumps the line counter and returns
  # the position behind the line ending. A LF directly following the CR
  # is skipped as well, so a CR LF pair counts as a single line break.
  assert(L.buf[pos] == CR)
  L.lineNumber += 1
  result = L.fillBaseLexer(pos)
  if L.buf[result] == LF:
    result = L.fillBaseLexer(result)
proc handleLF(L: var TBaseLexer, pos: int): int =
  # Handles the LF located at `pos`: bumps the line counter and returns
  # the position at which scanning continues.
  assert(L.buf[pos] == LF)
  L.lineNumber += 1
  result = L.fillBaseLexer(pos)
proc skipUTF8BOM(L: var TBaseLexer) =
  # Skips the three bytes EF BB BF if the buffer starts with them, by
  # advancing both the read position and the line start.
  # The bytes are checked one after another, so a later byte is only
  # inspected when all earlier ones matched.
  if L.buf[0] != '\xEF': return
  if L.buf[1] != '\xBB': return
  if L.buf[2] != '\xBF': return
  L.bufpos += 3
  L.lineStart += 3
proc openBaseLexer(L: var TBaseLexer, inputstream: PLLStream,
                   bufLen = 8192) =
  # Initialises `L` to read from `inputstream` through a freshly
  # allocated buffer of `bufLen` characters, loads the first chunk of
  # input and skips a leading UTF-8 byte order mark.
  assert(bufLen > 0)
  L.bufLen = bufLen
  L.bufpos = 0
  L.offsetBase = 0
  L.buf = cast[cstring](alloc(bufLen * chrSize))
  # With the sentinel on the last slot, the first `fillBuffer` call has
  # nothing to move and reads into the whole buffer.
  L.sentinel = bufLen - 1
  L.lineStart = 0
  L.lineNumber = 1            # line numbering starts at 1
  L.stream = inputstream
  L.fillBuffer()
  L.skipUTF8BOM()
proc getColNumber(L: TBaseLexer, pos: int): int =
  # Returns the distance between `pos` and the start of the current
  # line; the result is never negative.
  let delta = pos - L.lineStart
  result = if delta < 0: -delta else: delta
proc getCurrentLine(L: TBaseLexer, marker: bool = true): string =
  # Returns the text of the current line, from `L.lineStart` up to (not
  # including) the next CR, LF or end-of-file marker, followed by a
  # newline. With `marker` set, a second line is appended that holds a
  # '^' indented by the column of the current buffer position.
  result = ""
  var idx = L.lineStart
  while true:
    let ch = L.buf[idx]
    if ch in {CR, LF, EndOfFile}: break
    result.add(ch)
    inc(idx)
  result.add("\n")
  if marker:
    result.add(spaces(getColNumber(L, L.bufpos)))
    result.add('^')
    result.add("\n")
|