nimlexbase.nim 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173
  1. #
  2. #
  3. # The Nim Compiler
  4. # (c) Copyright 2012 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
# Base object of a lexer with efficient buffer handling. In fact,
# I believe that this is the most efficient method of buffer
# handling that exists! Checks for whether the buffer needs
# refilling are only necessary at line endings.
  13. import
  14. llstream, strutils
  15. const
  16. Lrz* = ' '
  17. Apo* = '\''
  18. Tabulator* = '\x09'
  19. ESC* = '\x1B'
  20. CR* = '\x0D'
  21. FF* = '\x0C'
  22. LF* = '\x0A'
  23. BEL* = '\x07'
  24. BACKSPACE* = '\x08'
  25. VT* = '\x0B'
  26. const
  27. EndOfFile* = '\0' # end of file marker
  28. # A little picture makes everything clear :-)
  29. # buf:
  30. # "Example Text\n ha!" bufLen = 17
  31. # ^pos = 0 ^ sentinel = 12
  32. #
  33. NewLines* = {CR, LF}
type
  TBaseLexer* = object of RootObj
    # State of a lexer that reads its input through a refillable character
    # buffer and keeps the bookkeeping needed for line/column reporting.
    bufpos*: int              # current position in `buf`; reset to 0 whenever
                              # the buffer is refilled
    buf*: cstring             # the buffer; allocated by openBaseLexer and
                              # freed by closeBaseLexer
    bufLen*: int              # length of buffer in characters
    stream*: PLLStream        # we read from this stream
    lineNumber*: int          # the current line number (starts at 1)
                              # private data:
                              # NOTE(review): labelled private, but the fields
                              # below are exported with `*` like the others.
    sentinel*: int            # index in `buf` of the last newline found (or
                              # of the EndOfFile marker); reaching it triggers
                              # a refill
    lineStart*: int           # index of last line start in buffer
    offsetBase*: int          # use ``offsetBase + bufpos`` to get the offset
proc openBaseLexer*(L: var TBaseLexer, inputstream: PLLStream,
                    bufLen: int = 8192)
  # Initializes `L` to read from `inputstream` with an initial buffer of
  # `bufLen` characters and loads the first chunk of input.
  # 8K is a reasonable buffer size
proc closeBaseLexer*(L: var TBaseLexer)
  # Frees the buffer of `L` and closes its stream.
proc getCurrentLine*(L: TBaseLexer, marker: bool = true): string
  # Returns the text of the current line followed by a newline. If `marker`
  # is true, a second line is appended that places a `^` under the column
  # of the current buffer position.
proc getColNumber*(L: TBaseLexer, pos: int): int
  # Returns the distance between `pos` and the start of the current line.
proc handleCR*(L: var TBaseLexer, pos: int): int
  # Call this if you scanned over CR in the buffer; it returns the
  # position to continue the scanning from. `pos` must be the position
  # of the CR.
proc handleLF*(L: var TBaseLexer, pos: int): int
  # Call this if you scanned over LF in the buffer; it returns the
  # position to continue the scanning from. `pos` must be the position
  # of the LF.
  59. # implementation
  60. const
  61. chrSize = sizeof(char)
  62. proc closeBaseLexer(L: var TBaseLexer) =
  63. dealloc(L.buf)
  64. llStreamClose(L.stream)
proc fillBuffer(L: var TBaseLexer) =
  # Refills the buffer of `L`:
  #   1. the text after the sentinel is moved to the front of the buffer,
  #   2. the space freed that way is filled with new data from `L.stream`,
  #   3. a new sentinel is determined: the EndOfFile marker if the stream ran
  #      out of data, otherwise the last newline character in the buffer.
  # If the buffer is full and contains no newline at all, its size is doubled
  # and more data is read until a newline or the end of the input is found.
  var
    charsRead, toCopy, s: int # all are in characters,
                              # not bytes (in case this
                              # is not the same)
    oldBufLen: int
  # we know here that pos == L.sentinel, but not if this proc
  # is called the first time by openBaseLexer()
  assert(L.sentinel < L.bufLen)
  # number of characters behind the sentinel that still have to be kept:
  toCopy = L.bufLen - L.sentinel - 1
  assert(toCopy >= 0)
  if toCopy > 0:
    moveMem(L.buf, addr(L.buf[L.sentinel + 1]), toCopy * chrSize)
    # "moveMem" handles overlapping regions
  # fill the remaining `L.sentinel + 1` slots behind the kept text:
  charsRead = llStreamRead(L.stream, addr(L.buf[toCopy]),
                           (L.sentinel + 1) * chrSize) div chrSize
  s = toCopy + charsRead      # number of valid characters now in the buffer
  if charsRead < L.sentinel + 1:
    # fewer characters than requested were delivered; this is treated as the
    # end of the input
    L.buf[s] = EndOfFile      # set end marker
    L.sentinel = s
  else:
    # compute sentinel:
    # the buffer is completely filled here, so `s` is one past the last
    # valid index (assuming llStreamRead never returns more than requested)
    dec(s)                    # BUGFIX (valgrind)
    while true:
      assert(s < L.bufLen)
      # search backwards for the last newline in the buffer:
      while (s >= 0) and not (L.buf[s] in NewLines): dec(s)
      if s >= 0:
        # we found an appropriate character for a sentinel:
        L.sentinel = s
        break
      else:
        # rather than to give up here because the line is too long,
        # double the buffer's size and try again:
        oldBufLen = L.bufLen
        L.bufLen = L.bufLen * 2
        L.buf = cast[cstring](realloc(L.buf, L.bufLen * chrSize))
        assert(L.bufLen - oldBufLen == oldBufLen)
        # read into the newly added second half of the buffer:
        charsRead = llStreamRead(L.stream, addr(L.buf[oldBufLen]),
                                 oldBufLen * chrSize) div chrSize
        if charsRead < oldBufLen:
          # end of input reached before another newline could be searched
          # for; the end marker becomes the sentinel
          L.buf[oldBufLen + charsRead] = EndOfFile
          L.sentinel = oldBufLen + charsRead
          break
        # buffer is full again: restart the backwards search from its end
        s = L.bufLen - 1
  109. proc fillBaseLexer(L: var TBaseLexer, pos: int): int =
  110. assert(pos <= L.sentinel)
  111. if pos < L.sentinel:
  112. result = pos + 1 # nothing to do
  113. else:
  114. fillBuffer(L)
  115. L.offsetBase += pos + 1
  116. L.bufpos = 0
  117. result = 0
  118. L.lineStart = result
  119. proc handleCR(L: var TBaseLexer, pos: int): int =
  120. assert(L.buf[pos] == CR)
  121. inc(L.lineNumber)
  122. result = fillBaseLexer(L, pos)
  123. if L.buf[result] == LF:
  124. result = fillBaseLexer(L, result)
  125. proc handleLF(L: var TBaseLexer, pos: int): int =
  126. assert(L.buf[pos] == LF)
  127. inc(L.lineNumber)
  128. result = fillBaseLexer(L, pos) #L.lastNL := result-1; // BUGFIX: was: result;
  129. proc skipUTF8BOM(L: var TBaseLexer) =
  130. if L.buf[0] == '\xEF' and L.buf[1] == '\xBB' and L.buf[2] == '\xBF':
  131. inc(L.bufpos, 3)
  132. inc(L.lineStart, 3)
  133. proc openBaseLexer(L: var TBaseLexer, inputstream: PLLStream, bufLen = 8192) =
  134. assert(bufLen > 0)
  135. L.bufpos = 0
  136. L.offsetBase = 0
  137. L.bufLen = bufLen
  138. L.buf = cast[cstring](alloc(bufLen * chrSize))
  139. L.sentinel = bufLen - 1
  140. L.lineStart = 0
  141. L.lineNumber = 1 # lines start at 1
  142. L.stream = inputstream
  143. fillBuffer(L)
  144. skipUTF8BOM(L)
  145. proc getColNumber(L: TBaseLexer, pos: int): int =
  146. result = abs(pos - L.lineStart)
  147. proc getCurrentLine(L: TBaseLexer, marker: bool = true): string =
  148. result = ""
  149. var i = L.lineStart
  150. while not (L.buf[i] in {CR, LF, EndOfFile}):
  151. add(result, L.buf[i])
  152. inc(i)
  153. result.add("\n")
  154. if marker:
  155. result.add(spaces(getColNumber(L, L.bufpos)) & '^' & "\n")