nimlexbase.nim 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. #
  2. #
  3. # The Nim Compiler
  4. # (c) Copyright 2012 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
# Base object of a lexer with efficient buffer handling. In fact,
# I believe that this is the most efficient method of buffer
# handling that exists! A check whether the buffer needs refilling
# is only necessary at line endings.
  13. import
  14. llstream, strutils
  15. const
  16. Lrz* = ' '
  17. Apo* = '\''
  18. Tabulator* = '\x09'
  19. ESC* = '\x1B'
  20. CR* = '\x0D'
  21. FF* = '\x0C'
  22. LF* = '\x0A'
  23. BEL* = '\x07'
  24. BACKSPACE* = '\x08'
  25. VT* = '\x0B'
  26. const
  27. EndOfFile* = '\0' # end of file marker
  28. # A little picture makes everything clear :-)
  29. # buf:
  30. # "Example Text\n ha!" bufLen = 17
  31. # ^pos = 0 ^ sentinel = 12
  32. #
  33. NewLines* = {CR, LF}
  34. type
  35. TBaseLexer* = object of RootObj
  36. bufpos*: int
  37. buf*: string
  38. stream*: PLLStream # we read from this stream
  39. lineNumber*: int # the current line number
  40. # private data:
  41. sentinel*: int
  42. lineStart*: int # index of last line start in buffer
  43. offsetBase*: int # use ``offsetBase + bufpos`` to get the offset
  44. proc openBaseLexer*(L: var TBaseLexer, inputstream: PLLStream,
  45. bufLen: int = 8192)
  46. # 8K is a reasonable buffer size
  47. proc closeBaseLexer*(L: var TBaseLexer)
  48. proc getCurrentLine*(L: TBaseLexer, marker: bool = true): string
  49. proc getColNumber*(L: TBaseLexer, pos: int): int
  50. proc handleCR*(L: var TBaseLexer, pos: int): int
  51. # Call this if you scanned over CR in the buffer; it returns the
  52. # position to continue the scanning from. `pos` must be the position
  53. # of the CR.
  54. proc handleLF*(L: var TBaseLexer, pos: int): int
  55. # Call this if you scanned over LF in the buffer; it returns the the
  56. # position to continue the scanning from. `pos` must be the position
  57. # of the LF.
  58. # implementation
  59. proc closeBaseLexer(L: var TBaseLexer) =
  60. llStreamClose(L.stream)
  61. proc fillBuffer(L: var TBaseLexer) =
  62. var
  63. charsRead, toCopy, s: int # all are in characters,
  64. # not bytes (in case this
  65. # is not the same)
  66. oldBufLen: int
  67. # we know here that pos == L.sentinel, but not if this proc
  68. # is called the first time by initBaseLexer()
  69. assert(L.sentinel < L.buf.len)
  70. toCopy = L.buf.len - L.sentinel - 1
  71. assert(toCopy >= 0)
  72. if toCopy > 0:
  73. moveMem(addr L.buf[0], addr L.buf[L.sentinel + 1], toCopy)
  74. # "moveMem" handles overlapping regions
  75. charsRead = llStreamRead(L.stream, addr L.buf[toCopy], L.sentinel + 1)
  76. s = toCopy + charsRead
  77. if charsRead < L.sentinel + 1:
  78. L.buf[s] = EndOfFile # set end marker
  79. L.sentinel = s
  80. else:
  81. # compute sentinel:
  82. dec(s) # BUGFIX (valgrind)
  83. while true:
  84. assert(s < L.buf.len)
  85. while (s >= 0) and not (L.buf[s] in NewLines): dec(s)
  86. if s >= 0:
  87. # we found an appropriate character for a sentinel:
  88. L.sentinel = s
  89. break
  90. else:
  91. # rather than to give up here because the line is too long,
  92. # double the buffer's size and try again:
  93. oldBufLen = L.buf.len
  94. L.buf.setLen(L.buf.len * 2)
  95. assert(L.buf.len - oldBufLen == oldBufLen)
  96. charsRead = llStreamRead(L.stream, addr(L.buf[oldBufLen]),
  97. oldBufLen)
  98. if charsRead < oldBufLen:
  99. L.buf[oldBufLen + charsRead] = EndOfFile
  100. L.sentinel = oldBufLen + charsRead
  101. break
  102. s = L.buf.len - 1
  103. proc fillBaseLexer(L: var TBaseLexer, pos: int): int =
  104. assert(pos <= L.sentinel)
  105. if pos < L.sentinel:
  106. result = pos + 1 # nothing to do
  107. else:
  108. fillBuffer(L)
  109. L.offsetBase += pos + 1
  110. L.bufpos = 0
  111. result = 0
  112. L.lineStart = result
  113. proc handleCR(L: var TBaseLexer, pos: int): int =
  114. assert(L.buf[pos] == CR)
  115. inc(L.lineNumber)
  116. result = fillBaseLexer(L, pos)
  117. if L.buf[result] == LF:
  118. result = fillBaseLexer(L, result)
  119. proc handleLF(L: var TBaseLexer, pos: int): int =
  120. assert(L.buf[pos] == LF)
  121. inc(L.lineNumber)
  122. result = fillBaseLexer(L, pos) #L.lastNL := result-1; // BUGFIX: was: result;
  123. proc skipUTF8BOM(L: var TBaseLexer) =
  124. if L.buf[0] == '\xEF' and L.buf[1] == '\xBB' and L.buf[2] == '\xBF':
  125. inc(L.bufpos, 3)
  126. inc(L.lineStart, 3)
  127. proc openBaseLexer(L: var TBaseLexer, inputstream: PLLStream, bufLen = 8192) =
  128. assert(bufLen > 0)
  129. L.bufpos = 0
  130. L.offsetBase = 0
  131. L.buf = newString(bufLen)
  132. L.sentinel = bufLen - 1
  133. L.lineStart = 0
  134. L.lineNumber = 1 # lines start at 1
  135. L.stream = inputstream
  136. fillBuffer(L)
  137. skipUTF8BOM(L)
  138. proc getColNumber(L: TBaseLexer, pos: int): int =
  139. result = abs(pos - L.lineStart)
  140. proc getCurrentLine(L: TBaseLexer, marker: bool = true): string =
  141. result = ""
  142. var i = L.lineStart
  143. while not (L.buf[i] in {CR, LF, EndOfFile}):
  144. add(result, L.buf[i])
  145. inc(i)
  146. result.add("\n")
  147. if marker:
  148. result.add(spaces(getColNumber(L, L.bufpos)) & '^' & "\n")