lexbase.nim 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2009 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module implements a base object of a lexer with efficient buffer
  10. ## handling. Only at line endings checks are necessary if the buffer
  11. ## needs refilling.
  12. import
  13. std/[strutils, streams]
  14. when defined(nimPreviewSlimSystem):
  15. import std/assertions
  16. const
  17. EndOfFile* = '\0' ## end of file marker
  18. NewLines* = {'\c', '\L'}
# Buffer handling:
#  buf:
#  "Example Text\n ha!"   bufLen = 17
#   ^pos = 0    ^ sentinel = 12
#
  24. type
  25. BaseLexer* = object of RootObj ## the base lexer. Inherit your lexer from
  26. ## this object.
  27. bufpos*: int ## the current position within the buffer
  28. buf*: string ## the buffer itself
  29. input: Stream ## the input stream
  30. lineNumber*: int ## the current line number
  31. sentinel: int
  32. lineStart: int # index of last line start in buffer
  33. offsetBase*: int # use `offsetBase + bufpos` to get the offset
  34. refillChars: set[char]
  35. proc close*(L: var BaseLexer) =
  36. ## closes the base lexer. This closes `L`'s associated stream too.
  37. close(L.input)
  38. proc fillBuffer(L: var BaseLexer) =
  39. var
  40. charsRead, toCopy, s: int # all are in characters,
  41. # not bytes (in case this
  42. # is not the same)
  43. oldBufLen: int
  44. # we know here that pos == L.sentinel, but not if this proc
  45. # is called the first time by initBaseLexer()
  46. assert(L.sentinel + 1 <= L.buf.len)
  47. toCopy = L.buf.len - (L.sentinel + 1)
  48. assert(toCopy >= 0)
  49. if toCopy > 0:
  50. when defined(js) or defined(nimscript):
  51. # nimscript has to be here to avoid compiling other branch (moveMem)
  52. for i in 0 ..< toCopy:
  53. L.buf[i] = L.buf[L.sentinel + 1 + i]
  54. else:
  55. when nimvm:
  56. for i in 0 ..< toCopy:
  57. L.buf[i] = L.buf[L.sentinel + 1 + i]
  58. else:
  59. # "moveMem" handles overlapping regions
  60. moveMem(addr L.buf[0], addr L.buf[L.sentinel + 1], toCopy)
  61. charsRead = L.input.readDataStr(L.buf, toCopy ..< toCopy + L.sentinel + 1)
  62. s = toCopy + charsRead
  63. if charsRead < L.sentinel + 1:
  64. L.buf[s] = EndOfFile # set end marker
  65. L.sentinel = s
  66. else:
  67. # compute sentinel:
  68. dec(s) # BUGFIX (valgrind)
  69. while true:
  70. assert(s < L.buf.len)
  71. while s >= 0 and L.buf[s] notin L.refillChars: dec(s)
  72. if s >= 0:
  73. # we found an appropriate character for a sentinel:
  74. L.sentinel = s
  75. break
  76. else:
  77. # rather than to give up here because the line is too long,
  78. # double the buffer's size and try again:
  79. oldBufLen = L.buf.len
  80. L.buf.setLen(L.buf.len * 2)
  81. charsRead = readDataStr(L.input, L.buf, oldBufLen ..< L.buf.len)
  82. if charsRead < oldBufLen:
  83. L.buf[oldBufLen + charsRead] = EndOfFile
  84. L.sentinel = oldBufLen + charsRead
  85. break
  86. s = L.buf.len - 1
  87. proc fillBaseLexer(L: var BaseLexer, pos: int): int =
  88. assert(pos <= L.sentinel)
  89. if pos < L.sentinel:
  90. result = pos + 1 # nothing to do
  91. else:
  92. fillBuffer(L)
  93. L.offsetBase += pos
  94. L.bufpos = 0
  95. result = 0
  96. proc handleCR*(L: var BaseLexer, pos: int): int =
  97. ## Call this if you scanned over `'\c'` in the buffer; it returns the
  98. ## position to continue the scanning from. `pos` must be the position
  99. ## of the `'\c'`.
  100. assert(L.buf[pos] == '\c')
  101. inc(L.lineNumber)
  102. result = fillBaseLexer(L, pos)
  103. if L.buf[result] == '\L':
  104. result = fillBaseLexer(L, result)
  105. L.lineStart = result
  106. proc handleLF*(L: var BaseLexer, pos: int): int =
  107. ## Call this if you scanned over `'\L'` in the buffer; it returns the
  108. ## position to continue the scanning from. `pos` must be the position
  109. ## of the `'\L'`.
  110. assert(L.buf[pos] == '\L')
  111. inc(L.lineNumber)
  112. result = fillBaseLexer(L, pos) #L.lastNL := result-1; // BUGFIX: was: result;
  113. L.lineStart = result
  114. proc handleRefillChar*(L: var BaseLexer, pos: int): int =
  115. ## Call this if a terminator character other than a new line is scanned
  116. ## at `pos`; it returns the position to continue the scanning from.
  117. assert(L.buf[pos] in L.refillChars)
  118. result = fillBaseLexer(L, pos) #L.lastNL := result-1; // BUGFIX: was: result;
  119. proc skipUtf8Bom(L: var BaseLexer) =
  120. if (L.buf[0] == '\xEF') and (L.buf[1] == '\xBB') and (L.buf[2] == '\xBF'):
  121. inc(L.bufpos, 3)
  122. inc(L.lineStart, 3)
  123. proc open*(L: var BaseLexer, input: Stream, bufLen: int = 8192;
  124. refillChars: set[char] = NewLines) =
  125. ## inits the BaseLexer with a stream to read from.
  126. assert(bufLen > 0)
  127. assert(input != nil)
  128. L.input = input
  129. L.bufpos = 0
  130. L.offsetBase = 0
  131. L.refillChars = refillChars
  132. L.buf = newString(bufLen)
  133. L.sentinel = bufLen - 1
  134. L.lineStart = 0
  135. L.lineNumber = 1 # lines start at 1
  136. fillBuffer(L)
  137. skipUtf8Bom(L)
  138. proc getColNumber*(L: BaseLexer, pos: int): int =
  139. ## retrieves the current column.
  140. result = abs(pos - L.lineStart)
  141. proc getCurrentLine*(L: BaseLexer, marker: bool = true): string =
  142. ## retrieves the current line.
  143. var i: int
  144. result = ""
  145. i = L.lineStart
  146. while not (L.buf[i] in {'\c', '\L', EndOfFile}):
  147. add(result, L.buf[i])
  148. inc(i)
  149. add(result, "\n")
  150. if marker:
  151. add(result, spaces(getColNumber(L, L.bufpos)) & "^\n")