lexbase.nim 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178
  1. #
  2. #
  3. # The Nim Compiler
  4. # (c) Copyright 2009 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
## This module implements a base object of a lexer with efficient buffer
## handling. Checks for whether the buffer needs refilling are only
## necessary at line endings (more generally: at the `refillChars` that
## are passed to `open`).
  12. import
  13. strutils, streams
const
  EndOfFile* = '\0'           ## end of file marker
  NewLines* = {'\c', '\L'}    ## carriage return and line feed; this set is
                              ## the default for the `refillChars`
                              ## parameter of `open`
# Buffer handling:
#   buf:
#   "Example Text\n ha!"   bufLen = 17
#    ^pos = 0    ^ sentinel = 12
#
type
  BaseLexer* = object of RootObj ## the base lexer. Inherit your lexer from
                                 ## this object.
    bufpos*: int              ## the current position within the buffer
    when defined(js):         ## the buffer itself
      buf*: string
    else:
      buf*: cstring
    bufLen*: int              ## length of buffer in characters
    input: Stream             ## the input stream
    lineNumber*: int          ## the current line number
    sentinel: int             # buffer index at which a refill is triggered:
                              # either the last character from
                              # `refillChars` found in the buffer or the
                              # position of the `EndOfFile` marker
    lineStart: int            # index of last line start in buffer
    offsetBase*: int          # use ``offsetBase + bufpos`` to get the offset
    refillChars: set[char]    # characters eligible to become the sentinel;
                              # the buffer is only refilled when the scanner
                              # sits on one of them
const
  chrSize = sizeof(char)      # size of one buffer character in bytes; used
                              # to convert character counts into the byte
                              # counts expected by the memory/stream procs
  39. proc close*(L: var BaseLexer) =
  40. ## closes the base lexer. This closes `L`'s associated stream too.
  41. when not defined(js):
  42. dealloc(L.buf)
  43. close(L.input)
  44. proc fillBuffer(L: var BaseLexer) =
  45. var
  46. charsRead, toCopy, s: int # all are in characters,
  47. # not bytes (in case this
  48. # is not the same)
  49. oldBufLen: int
  50. # we know here that pos == L.sentinel, but not if this proc
  51. # is called the first time by initBaseLexer()
  52. assert(L.sentinel < L.bufLen)
  53. toCopy = L.bufLen - L.sentinel - 1
  54. assert(toCopy >= 0)
  55. if toCopy > 0:
  56. when defined(js):
  57. for i in 0 ..< toCopy: L.buf[i] = L.buf[L.sentinel + 1 + i]
  58. else:
  59. # "moveMem" handles overlapping regions
  60. moveMem(L.buf, addr L.buf[L.sentinel + 1], toCopy * chrSize)
  61. charsRead = readData(L.input, addr(L.buf[toCopy]),
  62. (L.sentinel + 1) * chrSize) div chrSize
  63. s = toCopy + charsRead
  64. if charsRead < L.sentinel + 1:
  65. L.buf[s] = EndOfFile # set end marker
  66. L.sentinel = s
  67. else:
  68. # compute sentinel:
  69. dec(s) # BUGFIX (valgrind)
  70. while true:
  71. assert(s < L.bufLen)
  72. while s >= 0 and L.buf[s] notin L.refillChars: dec(s)
  73. if s >= 0:
  74. # we found an appropriate character for a sentinel:
  75. L.sentinel = s
  76. break
  77. else:
  78. # rather than to give up here because the line is too long,
  79. # double the buffer's size and try again:
  80. oldBufLen = L.bufLen
  81. L.bufLen = L.bufLen * 2
  82. when defined(js):
  83. L.buf.setLen(L.bufLen)
  84. else:
  85. L.buf = cast[cstring](realloc(L.buf, L.bufLen * chrSize))
  86. assert(L.bufLen - oldBufLen == oldBufLen)
  87. charsRead = readData(L.input, addr(L.buf[oldBufLen]),
  88. oldBufLen * chrSize) div chrSize
  89. if charsRead < oldBufLen:
  90. L.buf[oldBufLen + charsRead] = EndOfFile
  91. L.sentinel = oldBufLen + charsRead
  92. break
  93. s = L.bufLen - 1
  94. proc fillBaseLexer(L: var BaseLexer, pos: int): int =
  95. assert(pos <= L.sentinel)
  96. if pos < L.sentinel:
  97. result = pos + 1 # nothing to do
  98. else:
  99. fillBuffer(L)
  100. L.offsetBase += pos
  101. L.bufpos = 0
  102. result = 0
  103. proc handleCR*(L: var BaseLexer, pos: int): int =
  104. ## Call this if you scanned over '\c' in the buffer; it returns the the
  105. ## position to continue the scanning from. `pos` must be the position
  106. ## of the '\c'.
  107. assert(L.buf[pos] == '\c')
  108. inc(L.lineNumber)
  109. result = fillBaseLexer(L, pos)
  110. if L.buf[result] == '\L':
  111. result = fillBaseLexer(L, result)
  112. L.lineStart = result
  113. proc handleLF*(L: var BaseLexer, pos: int): int =
  114. ## Call this if you scanned over '\L' in the buffer; it returns the the
  115. ## position to continue the scanning from. `pos` must be the position
  116. ## of the '\L'.
  117. assert(L.buf[pos] == '\L')
  118. inc(L.lineNumber)
  119. result = fillBaseLexer(L, pos) #L.lastNL := result-1; // BUGFIX: was: result;
  120. L.lineStart = result
  121. proc handleRefillChar*(L: var BaseLexer, pos: int): int =
  122. ## To be documented.
  123. assert(L.buf[pos] in L.refillChars)
  124. result = fillBaseLexer(L, pos) #L.lastNL := result-1; // BUGFIX: was: result;
  125. proc skipUtf8Bom(L: var BaseLexer) =
  126. if (L.buf[0] == '\xEF') and (L.buf[1] == '\xBB') and (L.buf[2] == '\xBF'):
  127. inc(L.bufpos, 3)
  128. inc(L.lineStart, 3)
  129. proc open*(L: var BaseLexer, input: Stream, bufLen: int = 8192;
  130. refillChars: set[char] = NewLines) =
  131. ## inits the BaseLexer with a stream to read from.
  132. assert(bufLen > 0)
  133. assert(input != nil)
  134. L.input = input
  135. L.bufpos = 0
  136. L.offsetBase = 0
  137. L.bufLen = bufLen
  138. L.refillChars = refillChars
  139. when defined(js):
  140. L.buf = newString(bufLen)
  141. else:
  142. L.buf = cast[cstring](alloc(bufLen * chrSize))
  143. L.sentinel = bufLen - 1
  144. L.lineStart = 0
  145. L.lineNumber = 1 # lines start at 1
  146. fillBuffer(L)
  147. skipUtf8Bom(L)
  148. proc getColNumber*(L: BaseLexer, pos: int): int =
  149. ## retrieves the current column.
  150. result = abs(pos - L.lineStart)
  151. proc getCurrentLine*(L: BaseLexer, marker: bool = true): string =
  152. ## retrieves the current line.
  153. var i: int
  154. result = ""
  155. i = L.lineStart
  156. while not (L.buf[i] in {'\c', '\L', EndOfFile}):
  157. add(result, L.buf[i])
  158. inc(i)
  159. add(result, "\n")
  160. if marker:
  161. add(result, spaces(getColNumber(L, L.bufpos)) & "^\n")