nimlexbase.nim 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171
  1. #
  2. #
  3. # The Nim Compiler
  4. # (c) Copyright 2012 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. # Base Object of a lexer with efficient buffer handling. In fact
  10. # I believe that this is the most efficient method of buffer
  11. # handling that exists! Only at line endings checks are necessary
  12. # if the buffer needs refilling.
  13. import
  14. llstream, strutils
  const
    # Named character constants used while scanning. The values are the
    # plain ASCII codes given in parentheses.
    Lrz* = ' '            ## blank / space (0x20)
    Apo* = '\''           ## apostrophe, i.e. single quote (0x27)
    Tabulator* = '\t'     ## horizontal tabulator (0x09)
    ESC* = '\e'           ## escape (0x1B)
    CR* = '\r'            ## carriage return (0x0D)
    FF* = '\f'            ## form feed (0x0C)
    LF* = '\l'            ## line feed (0x0A)
    BEL* = '\a'           ## bell / alert (0x07)
    BACKSPACE* = '\b'     ## backspace (0x08)
    VT* = '\v'            ## vertical tabulator (0x0B)
  const
    # The buffer layout at a glance:
    #
    #   buf:  "Example Text\n ha!"      bufLen = 17
    #          ^pos = 0    ^sentinel = 12
    #
    # The sentinel sits on a line ending (or on the end-of-file marker),
    # so a refill check is only needed when a line ending is scanned.
    EndOfFile* = '\x00'   ## end of file marker stored in the buffer
    NewLines* = {LF, CR}  ## the characters that terminate a line
  type
    TBaseLexer* = object of RootObj ## Base object of a lexer: a refillable
                                    ## character buffer plus line bookkeeping.
      bufpos*: int            ## current position within `buf`
      buf*: cstring           ## the buffer that is scanned; it is set from
                              ## `bufStorage` whenever that is (re)allocated
      bufStorage: string      ## the string that backs `buf`
      bufLen: int             ## length of the buffer in characters
      stream*: PLLStream      ## we read from this stream
      lineNumber*: int        ## the current line number
      # The original author labelled the remaining fields "private data",
      # although all three are exported:
      sentinel*: int          ## buffer index at which a refill is needed
      lineStart*: int         ## index of last line start in buffer
      offsetBase*: int        ## use ``offsetBase + bufpos`` to get the offset
  proc openBaseLexer*(L: var TBaseLexer, inputstream: PLLStream,
                      bufLen: int = 8192)
    ## Sets up `L` to read from `inputstream` with an initial buffer of
    ## `bufLen` characters. 8K is a reasonable buffer size.

  proc closeBaseLexer*(L: var TBaseLexer)
    ## Closes the stream that `L` reads from.

  proc getCurrentLine*(L: TBaseLexer, marker: bool = true): string
    ## Returns the text of the current line followed by a newline. With
    ## `marker` set, a second line is appended that carries a ``^`` at the
    ## column of the current buffer position.

  proc getColNumber*(L: TBaseLexer, pos: int): int
    ## Returns the distance of buffer position `pos` from the start of the
    ## current line.

  proc handleCR*(L: var TBaseLexer, pos: int): int
    ## Call this if you scanned over CR in the buffer; it returns the
    ## position to continue the scanning from. `pos` must be the position
    ## of the CR.

  proc handleLF*(L: var TBaseLexer, pos: int): int
    ## Call this if you scanned over LF in the buffer; it returns the
    ## position to continue the scanning from. `pos` must be the position
    ## of the LF.
  60. # implementation
  proc closeBaseLexer(L: var TBaseLexer) =
    ## Closes the stream the lexer was opened on.
    L.stream.llStreamClose()
  proc fillBuffer(L: var TBaseLexer) =
    ## Refills the buffer from `L.stream` and recomputes `L.sentinel`.
    ##
    ## Everything behind the old sentinel is moved to the front of the
    ## buffer and the freed space is filled from the stream. The new
    ## sentinel is the last line ending in the buffer; if the stream
    ## delivers fewer characters than requested, an `EndOfFile` marker is
    ## stored behind the data and becomes the sentinel instead. When the
    ## buffer holds no line ending at all, its size is doubled until one
    ## is found or the input ends.
    ##
    ## All counts are in characters, not bytes (in case this is not the
    ## same).
    # On a refill the scan position equals L.sentinel; on the very first
    # call (from openBaseLexer) the sentinel is preset to bufLen - 1.
    assert(L.sentinel < L.bufLen)
    let
      wanted = L.sentinel + 1       # characters consumed so far
      kept = L.bufLen - wanted      # unread characters behind the sentinel
    assert(kept >= 0)
    if kept > 0:
      # "moveMem" handles overlapping regions
      moveMem(addr L.buf[0], addr L.buf[wanted], kept)
    let got = llStreamRead(L.stream, addr L.buf[kept], wanted)
    if got < wanted:
      # short read: set the end marker and stop there
      L.buf[kept + got] = EndOfFile
      L.sentinel = kept + got
      return
    # The buffer is completely filled; search backwards for a line ending
    # that can act as the sentinel, starting at the last character read.
    var scan = kept + got - 1
    while true:
      assert(scan < L.bufLen)
      while scan >= 0 and L.buf[scan] notin NewLines: dec(scan)
      if scan >= 0:
        # we found an appropriate character for a sentinel
        L.sentinel = scan
        return
      # rather than to give up here because the line is too long,
      # double the buffer's size and try again:
      let prevLen = L.bufLen
      L.bufLen = prevLen * 2
      L.bufStorage.setLen(L.bufLen)
      L.buf = L.bufStorage          # the storage may have moved
      assert(L.bufLen - prevLen == prevLen)
      let extra = llStreamRead(L.stream, addr(L.buf[prevLen]), prevLen)
      if extra < prevLen:
        L.buf[prevLen + extra] = EndOfFile
        L.sentinel = prevLen + extra
        return
      scan = L.bufLen - 1
  proc fillBaseLexer(L: var TBaseLexer, pos: int): int =
    ## Steps over the line ending at `pos` and returns the position of the
    ## first character of the next line, which is also stored in
    ## `L.lineStart`. If `pos` is the sentinel the buffer is refilled
    ## first and scanning restarts at index 0.
    assert(pos <= L.sentinel)
    if pos >= L.sentinel:
      fillBuffer(L)
      # everything up to and including `pos` has left the buffer
      L.offsetBase += pos + 1
      L.bufpos = 0
      result = 0
    else:
      # nothing to do, the next character is already buffered
      result = pos + 1
    L.lineStart = result
  proc handleCR(L: var TBaseLexer, pos: int): int =
    ## Handles the CR at buffer position `pos`: counts the new line and
    ## returns the position to continue scanning from. An LF directly
    ## after the CR is skipped too, so CR LF counts as one line ending.
    assert(L.buf[pos] == CR)
    inc L.lineNumber
    let afterCR = fillBaseLexer(L, pos)
    result =
      if L.buf[afterCR] == LF: fillBaseLexer(L, afterCR)
      else: afterCR
  proc handleLF(L: var TBaseLexer, pos: int): int =
    ## Handles the LF at buffer position `pos`: counts the new line and
    ## returns the position to continue scanning from.
    assert(L.buf[pos] == LF)
    inc L.lineNumber
    fillBaseLexer(L, pos)
  proc skipUTF8BOM(L: var TBaseLexer) =
    ## Skips a UTF-8 byte order mark (EF BB BF) at the very start of the
    ## buffer by advancing both `bufpos` and `lineStart` past it.
    # The bytes are tested one after the other; a later byte is only
    # looked at when all earlier ones matched.
    if L.buf[0] != '\xEF': return
    if L.buf[1] != '\xBB': return
    if L.buf[2] != '\xBF': return
    L.bufpos += 3
    L.lineStart += 3
  proc openBaseLexer(L: var TBaseLexer, inputstream: PLLStream, bufLen = 8192) =
    ## Initialises `L` for reading from `inputstream`, allocates a buffer
    ## of `bufLen` characters, fills it for the first time and skips a
    ## leading UTF-8 byte order mark.
    assert(bufLen > 0)
    # input source and position bookkeeping
    L.stream = inputstream
    L.lineNumber = 1              # lines start at 1
    L.lineStart = 0
    L.offsetBase = 0
    L.bufpos = 0
    # buffer
    L.bufLen = bufLen
    L.bufStorage = newString(bufLen)
    L.buf = L.bufStorage
    # With the sentinel on the last index the first fillBuffer call has
    # nothing to keep and reads `bufLen` characters.
    L.sentinel = bufLen - 1
    fillBuffer(L)
    skipUTF8BOM(L)
  proc getColNumber(L: TBaseLexer, pos: int): int =
    ## Returns the absolute distance between buffer position `pos` and
    ## the start of the current line.
    let delta = pos - L.lineStart
    result = if delta < 0: -delta else: delta
  proc getCurrentLine(L: TBaseLexer, marker: bool = true): string =
    ## Returns the current line, read from `L.lineStart` up to the next
    ## CR, LF or end-of-file marker, followed by a newline. If `marker`
    ## is true a second line is appended that consists of spaces and a
    ## ``^`` placed at the column of `L.bufpos`.
    const lineEnd = {CR, LF, EndOfFile}
    result = ""
    var idx = L.lineStart
    while true:
      let ch = L.buf[idx]
      if ch in lineEnd: break
      result.add ch
      inc idx
    result.add "\n"
    if marker:
      result.add spaces(getColNumber(L, L.bufpos))
      result.add '^'
      result.add "\n"