lexer.nim 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310
  1. #
  2. #
  3. # The Nim Compiler
  4. # (c) Copyright 2015 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. # This scanner is handwritten for efficiency. I used an elegant buffering
  10. # scheme which I have not seen anywhere else:
  11. # We guarantee that a whole line is in the buffer. Thus only when scanning
  12. # the \n or \r character we have to check whether we need to read in the next
  13. # chunk. (\n or \r already need special handling for incrementing the line
  14. # counter; choosing both \n and \r allows the scanner to properly read Unix,
  15. # DOS or Macintosh text files, even when it is not the native format.
  16. import
  17. hashes, options, msgs, strutils, platform, idents, nimlexbase, llstream,
  18. wordrecg, lineinfos, pathutils, parseutils
const
  MaxLineLength* = 80           # lines longer than this lead to a warning
  numChars*: set[char] = {'0'..'9', 'a'..'z', 'A'..'Z'}
    # chars that may continue a numeric literal (incl. base/suffix letters)
  SymChars*: set[char] = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'}
    # chars valid inside an identifier; \x80..\xFF admits UTF-8 continuation bytes
  SymStartChars*: set[char] = {'a'..'z', 'A'..'Z', '\x80'..'\xFF'}
    # chars that may start an identifier (no digits)
  OpChars*: set[char] = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.',
    '|', '=', '%', '&', '$', '@', '~', ':'}
    # chars that form user-definable operators
  # don't forget to update the 'highlite' module if these charsets should change
type
  TokType* = enum
    ## The kind of a scanned token. The enum's *order* is semantically
    ## significant: keywords must form one contiguous run (see
    ## tokKeywordLow/tokKeywordHigh below) and `$` ranges elsewhere in this
    ## module rely on the literal groupings.
    tkInvalid = "tkInvalid", tkEof = "[EOF]", # order is important here!
    tkSymbol = "tkSymbol", # keywords:
    tkAddr = "addr", tkAnd = "and", tkAs = "as", tkAsm = "asm",
    tkBind = "bind", tkBlock = "block", tkBreak = "break", tkCase = "case", tkCast = "cast",
    tkConcept = "concept", tkConst = "const", tkContinue = "continue", tkConverter = "converter",
    tkDefer = "defer", tkDiscard = "discard", tkDistinct = "distinct", tkDiv = "div", tkDo = "do",
    tkElif = "elif", tkElse = "else", tkEnd = "end", tkEnum = "enum", tkExcept = "except", tkExport = "export",
    tkFinally = "finally", tkFor = "for", tkFrom = "from", tkFunc = "func",
    tkIf = "if", tkImport = "import", tkIn = "in", tkInclude = "include", tkInterface = "interface",
    tkIs = "is", tkIsnot = "isnot", tkIterator = "iterator",
    tkLet = "let",
    tkMacro = "macro", tkMethod = "method", tkMixin = "mixin", tkMod = "mod", tkNil = "nil", tkNot = "not", tkNotin = "notin",
    tkObject = "object", tkOf = "of", tkOr = "or", tkOut = "out",
    tkProc = "proc", tkPtr = "ptr", tkRaise = "raise", tkRef = "ref", tkReturn = "return",
    tkShl = "shl", tkShr = "shr", tkStatic = "static",
    tkTemplate = "template",
    tkTry = "try", tkTuple = "tuple", tkType = "type", tkUsing = "using",
    tkVar = "var", tkWhen = "when", tkWhile = "while", tkXor = "xor",
    tkYield = "yield", # end of keywords
    # numeric literals, grouped so tkIntLit..tkInt64Lit etc. are valid ranges:
    tkIntLit = "tkIntLit", tkInt8Lit = "tkInt8Lit", tkInt16Lit = "tkInt16Lit",
    tkInt32Lit = "tkInt32Lit", tkInt64Lit = "tkInt64Lit",
    tkUIntLit = "tkUIntLit", tkUInt8Lit = "tkUInt8Lit", tkUInt16Lit = "tkUInt16Lit",
    tkUInt32Lit = "tkUInt32Lit", tkUInt64Lit = "tkUInt64Lit",
    tkFloatLit = "tkFloatLit", tkFloat32Lit = "tkFloat32Lit",
    tkFloat64Lit = "tkFloat64Lit", tkFloat128Lit = "tkFloat128Lit",
    # string/char literals (R = raw, Triple = """...""", G = generalized):
    tkStrLit = "tkStrLit", tkRStrLit = "tkRStrLit", tkTripleStrLit = "tkTripleStrLit",
    tkGStrLit = "tkGStrLit", tkGTripleStrLit = "tkGTripleStrLit", tkCharLit = "tkCharLit",
    # punctuation:
    tkParLe = "(", tkParRi = ")", tkBracketLe = "[",
    tkBracketRi = "]", tkCurlyLe = "{", tkCurlyRi = "}",
    tkBracketDotLe = "[.", tkBracketDotRi = ".]",
    tkCurlyDotLe = "{.", tkCurlyDotRi = ".}",
    tkParDotLe = "(.", tkParDotRi = ".)",
    tkComma = ",", tkSemiColon = ";",
    tkColon = ":", tkColonColon = "::", tkEquals = "=",
    tkDot = ".", tkDotDot = "..", tkBracketLeColon = "[:",
    tkOpr, tkComment, tkAccent = "`",
    # these are fake tokens used by renderer.nim
    tkSpaces, tkInfixOpr, tkPrefixOpr, tkPostfixOpr

  TokTypes* = set[TokType]
const
  weakTokens = {tkComma, tkSemiColon, tkColon,
                tkParRi, tkParDotRi, tkBracketRi, tkBracketDotRi,
                tkCurlyRi} # \
    # tokens that should not be considered for previousToken
  # bounds of the contiguous keyword run inside TokType; isKeyword and the
  # parser depend on every keyword lying strictly between tkSymbol and tkIntLit
  tokKeywordLow* = succ(tkSymbol)
  tokKeywordHigh* = pred(tkIntLit)
type
  NumericalBase* = enum
    base10,                    # base10 is listed as the first element,
                               # so that it is the correct default value
    base2, base8, base16

  Token* = object              # a Nim token
    tokType*: TokType          # the type of the token
    indent*: int               # the indentation; != -1 if the token has been
                               # preceded with indentation
    ident*: PIdent             # the parsed identifier
    iNumber*: BiggestInt       # the parsed integer literal
    fNumber*: BiggestFloat     # the parsed floating point literal
    base*: NumericalBase       # the numerical base; only valid for int
                               # or float literals
    strongSpaceA*: int8        # leading spaces of an operator
    strongSpaceB*: int8        # trailing spaces of an operator
    literal*: string           # the parsed (string) literal; and
                               # documentation comments are here too
    line*, col*: int
    when defined(nimpretty):
      offsetA*, offsetB*: int  # used for pretty printing so that literals
                               # like 0b01 or r"\L" are unaffected
      commentOffsetA*, commentOffsetB*: int

  ErrorHandler* = proc (conf: ConfigRef; info: TLineInfo; msg: TMsgKind; arg: string)
    # callback that, when set, replaces direct reporting via msgs.message

  Lexer* = object of TBaseLexer
    fileIdx*: FileIndex
    indentAhead*: int          # if > 0 an indentation has already been read
                               # this is needed because scanning comments
                               # needs so much look-ahead
    currLineIndent*: int
    strongSpaces*, allowTabs*: bool
    errorHandler*: ErrorHandler
    cache*: IdentCache
    when defined(nimsuggest):
      previousToken: TLineInfo # position of the last token; used to attach
                               # the IDE cursor when it sits in whitespace
    config*: ConfigRef
proc getLineInfo*(L: Lexer, tok: Token): TLineInfo {.inline.} =
  ## Builds a TLineInfo for `tok` within `L`'s file; under nimpretty the
  ## byte offsets of the token and its comment are carried along as well.
  result = newLineInfo(L.fileIdx, tok.line, tok.col)
  when defined(nimpretty):
    result.offsetA = tok.offsetA
    result.offsetB = tok.offsetB
    result.commentOffsetA = tok.commentOffsetA
    result.commentOffsetB = tok.commentOffsetB
  118. proc isKeyword*(kind: TokType): bool =
  119. (kind >= tokKeywordLow) and (kind <= tokKeywordHigh)
  120. template ones(n): untyped = ((1 shl n)-1) # for utf-8 conversion
  121. proc isNimIdentifier*(s: string): bool =
  122. let sLen = s.len
  123. if sLen > 0 and s[0] in SymStartChars:
  124. var i = 1
  125. while i < sLen:
  126. if s[i] == '_': inc(i)
  127. if i < sLen and s[i] notin SymChars: return
  128. inc(i)
  129. result = true
  130. proc `$`*(tok: Token): string =
  131. case tok.tokType
  132. of tkIntLit..tkInt64Lit: $tok.iNumber
  133. of tkFloatLit..tkFloat64Lit: $tok.fNumber
  134. of tkInvalid, tkStrLit..tkCharLit, tkComment: tok.literal
  135. of tkParLe..tkColon, tkEof, tkAccent: $tok.tokType
  136. else:
  137. if tok.ident != nil:
  138. tok.ident.s
  139. else:
  140. ""
  141. proc prettyTok*(tok: Token): string =
  142. if isKeyword(tok.tokType): "keyword " & tok.ident.s
  143. else: $tok
proc printTok*(conf: ConfigRef; tok: Token) =
  ## Debug helper: writes "line:col<TAB>kind value" for `tok` via msgWriteln.
  msgWriteln(conf, $tok.line & ":" & $tok.col & "\t" & $tok.tokType & " " & $tok)
proc initToken*(L: var Token) =
  ## Resets every field of a token to its neutral state, allocating a fresh
  ## empty literal string. (The parameter is named `L` for historical
  ## reasons; it is a Token, not a Lexer.)
  L.tokType = tkInvalid
  L.iNumber = 0
  L.indent = 0
  L.strongSpaceA = 0
  L.literal = ""
  L.fNumber = 0.0
  L.base = base10
  L.ident = nil
  when defined(nimpretty):
    L.commentOffsetA = 0
    L.commentOffsetB = 0
proc fillToken(L: var Token) =
  ## Like initToken, but reuses the existing literal buffer via setLen(0)
  ## instead of allocating a new string — used on the hot scanning path.
  L.tokType = tkInvalid
  L.iNumber = 0
  L.indent = 0
  L.strongSpaceA = 0
  setLen(L.literal, 0)
  L.fNumber = 0.0
  L.base = base10
  L.ident = nil
  when defined(nimpretty):
    L.commentOffsetA = 0
    L.commentOffsetB = 0
proc openLexer*(lex: var Lexer, fileIdx: FileIndex, inputstream: PLLStream;
                cache: IdentCache; config: ConfigRef) =
  ## Initializes `lex` over `inputstream`. The stream's lineOffset is added
  ## to the base lexer's line counter so positions in injected prefixes
  ## (e.g. --eval snippets) stay correct.
  openBaseLexer(lex, inputstream)
  lex.fileIdx = fileIdx
  lex.indentAhead = -1      # -1 means: no indentation pre-read
  lex.currLineIndent = 0
  inc(lex.lineNumber, inputstream.lineOffset)
  lex.cache = cache
  when defined(nimsuggest):
    lex.previousToken.fileIndex = fileIdx
  lex.config = config
proc openLexer*(lex: var Lexer, filename: AbsoluteFile, inputstream: PLLStream;
                cache: IdentCache; config: ConfigRef) =
  ## Convenience overload: resolves `filename` to a FileIndex first.
  openLexer(lex, fileInfoIdx(config, filename), inputstream, cache, config)
proc closeLexer*(lex: var Lexer) =
  ## Tears down the lexer, crediting the lines it scanned to the global
  ## linesCompiled statistic when a config is attached.
  if lex.config != nil:
    inc(lex.config.linesCompiled, lex.lineNumber)
  closeBaseLexer(lex)
proc getLineInfo(L: Lexer): TLineInfo =
  ## Line info for the lexer's *current* buffer position.
  result = newLineInfo(L.fileIdx, L.lineNumber, getColNumber(L, L.bufpos))
  190. proc dispMessage(L: Lexer; info: TLineInfo; msg: TMsgKind; arg: string) =
  191. if L.errorHandler.isNil:
  192. msgs.message(L.config, info, msg, arg)
  193. else:
  194. L.errorHandler(L.config, info, msg, arg)
proc lexMessage*(L: Lexer, msg: TMsgKind, arg = "") =
  ## Reports `msg` at the lexer's current position.
  L.dispMessage(getLineInfo(L), msg, arg)

proc lexMessageTok*(L: Lexer, msg: TMsgKind, tok: Token, arg = "") =
  ## Reports `msg` at the position recorded in `tok`.
  var info = newLineInfo(L.fileIdx, tok.line, tok.col)
  L.dispMessage(info, msg, arg)

proc lexMessagePos(L: var Lexer, msg: TMsgKind, pos: int, arg = "") =
  ## Reports `msg` at absolute buffer offset `pos` on the current line.
  var info = newLineInfo(L.fileIdx, L.lineNumber, pos - L.lineStart)
  L.dispMessage(info, msg, arg)
  203. proc matchTwoChars(L: Lexer, first: char, second: set[char]): bool =
  204. result = (L.buf[L.bufpos] == first) and (L.buf[L.bufpos + 1] in second)
template tokenBegin(tok, pos) {.dirty.} =
  ## Marks the start of a token. Under nimsuggest it injects `colA` into the
  ## caller's scope (hence {.dirty.}) for the tokenEnd* templates to read;
  ## under nimpretty it records the token's absolute start offset.
  when defined(nimsuggest):
    var colA = getColNumber(L, pos)
  when defined(nimpretty):
    tok.offsetA = L.offsetBase + pos
template tokenEnd(tok, pos) {.dirty.} =
  ## Marks the end of a token. Under nimsuggest: if the IDE's tracked cursor
  ## falls inside [colA, colB] on this line, snap the track position to the
  ## token's start column; colA is then zeroed so the token matches only once.
  when defined(nimsuggest):
    let colB = getColNumber(L, pos)+1
    if L.fileIdx == L.config.m.trackPos.fileIndex and L.config.m.trackPos.col in colA..colB and
        L.lineNumber == L.config.m.trackPos.line.int and L.config.ideCmd in {ideSug, ideCon}:
      L.config.m.trackPos.col = colA.int16
    colA = 0
  when defined(nimpretty):
    tok.offsetB = L.offsetBase + pos
template tokenEndIgnore(tok, pos) =
  ## Like tokenEnd, but when the cursor lands inside this token the track
  ## position is *invalidated* instead of snapped — used for tokens that
  ## should never be suggestion targets (string bodies, comments).
  when defined(nimsuggest):
    let colB = getColNumber(L, pos)
    if L.fileIdx == L.config.m.trackPos.fileIndex and L.config.m.trackPos.col in colA..colB and
        L.lineNumber == L.config.m.trackPos.line.int and L.config.ideCmd in {ideSug, ideCon}:
      L.config.m.trackPos.fileIndex = trackPosInvalidFileIdx
      L.config.m.trackPos.line = 0'u16
    colA = 0
  when defined(nimpretty):
    tok.offsetB = L.offsetBase + pos
template tokenEndPrevious(tok, pos) =
  when defined(nimsuggest):
    # when we detect the cursor in whitespace, we attach the track position
    # to the token that came before that, but only if we haven't detected
    # the cursor in a string literal or comment:
    let colB = getColNumber(L, pos)
    if L.fileIdx == L.config.m.trackPos.fileIndex and L.config.m.trackPos.col in colA..colB and
        L.lineNumber == L.config.m.trackPos.line.int and L.config.ideCmd in {ideSug, ideCon}:
      L.config.m.trackPos = L.previousToken
      L.config.m.trackPosAttached = true
    colA = 0
  when defined(nimpretty):
    tok.offsetB = L.offsetBase + pos
template eatChar(L: var Lexer, t: var Token, replacementChar: char) =
  ## Appends `replacementChar` (not the buffer char!) to the literal and
  ## advances the cursor — used to normalize e.g. '0X' to '0x'.
  t.literal.add(replacementChar)
  inc(L.bufpos)

template eatChar(L: var Lexer, t: var Token) =
  ## Appends the character under the cursor to the literal and advances.
  t.literal.add(L.buf[L.bufpos])
  inc(L.bufpos)
proc getNumber(L: var Lexer, result: var Token) =
  ## Scans a numeric literal into `result`: handles base prefixes
  ## (0x/0o/0b/0B plus the deprecated 0c/0C and rejected 0O), underscore
  ## digit separators, float forms (dot, exponent), type suffixes
  ## ('i8..'i64, 'u..'u64, 'f32..'f128, 'd/'f shorthands) and performs
  ## range/overflow checking. Works in three stages over the same span:
  ## build the literal string, parse the suffix, then extract the value.
  proc matchUnderscoreChars(L: var Lexer, tok: var Token, chars: set[char]): Natural =
    ## Matches a run of `chars` where single underscores may separate
    ## digits; returns how many digit chars were matched (underscores
    ## excluded). Reports an error for doubled or trailing underscores.
    var pos = L.bufpos              # use registers for pos, buf
    result = 0
    while true:
      if L.buf[pos] in chars:
        tok.literal.add(L.buf[pos])
        inc(pos)
        inc(result)
      else:
        break
      if L.buf[pos] == '_':
        if L.buf[pos+1] notin chars:
          lexMessage(L, errGenerated,
            "only single underscores may occur in a token and token may not " &
            "end with an underscore: e.g. '1__1' and '1_' are invalid")
          break
        tok.literal.add('_')
        inc(pos)
    L.bufpos = pos

  proc matchChars(L: var Lexer, tok: var Token, chars: set[char]) =
    ## Matches a plain run of `chars` with no underscore special-casing.
    var pos = L.bufpos              # use registers for pos, buf
    while L.buf[pos] in chars:
      tok.literal.add(L.buf[pos])
      inc(pos)
    L.bufpos = pos

  proc lexMessageLitNum(L: var Lexer, msg: string, startpos: int, msgKind = errGenerated) =
    # Used to get slightly human friendlier err messages.
    # Re-scans the whole offending literal (incl. exponent sign and suffix)
    # into a throwaway token so `msg % literal` shows the full spelling.
    const literalishChars = {'A'..'F', 'a'..'f', '0'..'9', 'X', 'x', 'o', 'O',
      'c', 'C', 'b', 'B', '_', '.', '\'', 'd', 'i', 'u'}
    var msgPos = L.bufpos
    var t: Token
    t.literal = ""
    L.bufpos = startpos # Use L.bufpos as pos because of matchChars
    matchChars(L, t, literalishChars)
    # We must verify +/- specifically so that we're not past the literal
    if L.buf[L.bufpos] in {'+', '-'} and
        L.buf[L.bufpos - 1] in {'e', 'E'}:
      t.literal.add(L.buf[L.bufpos])
      inc(L.bufpos)
      matchChars(L, t, literalishChars)
    if L.buf[L.bufpos] in {'\'', 'f', 'F', 'd', 'D', 'i', 'I', 'u', 'U'}:
      inc(L.bufpos)
      t.literal.add(L.buf[L.bufpos])
      matchChars(L, t, {'0'..'9'})
    L.bufpos = msgPos               # restore the real scan position
    lexMessage(L, msgKind, msg % t.literal)

  var
    startpos, endpos: int
    xi: BiggestInt                  # accumulator for non-decimal values
    isBase10 = true                 # note: written below but never read here
    numDigits = 0
  const
    # 'c', 'C' is deprecated
    baseCodeChars = {'X', 'x', 'o', 'b', 'B', 'c', 'C'}
    literalishChars = baseCodeChars + {'A'..'F', 'a'..'f', '0'..'9', '_', '\''}
    floatTypes = {tkFloatLit, tkFloat32Lit, tkFloat64Lit, tkFloat128Lit}
  result.tokType = tkIntLit         # int literal until we know better
  result.literal = ""
  result.base = base10
  startpos = L.bufpos
  tokenBegin(result, startpos)

  # First stage: find out base, make verifications, build token literal string
  # {'c', 'C'} is added for deprecation reasons to provide a clear error message
  if L.buf[L.bufpos] == '0' and L.buf[L.bufpos + 1] in baseCodeChars + {'c', 'C', 'O'}:
    isBase10 = false
    eatChar(L, result, '0')
    case L.buf[L.bufpos]
    of 'c', 'C':
      lexMessageLitNum(L,
        "$1 will soon be invalid for oct literals; Use '0o' " &
        "for octals. 'c', 'C' prefix",
        startpos,
        warnDeprecated)
      eatChar(L, result, 'c')
      numDigits = matchUnderscoreChars(L, result, {'0'..'7'})
    of 'O':
      lexMessageLitNum(L, "$1 is an invalid int literal; For octal literals " &
        "use the '0o' prefix.", startpos)
    of 'x', 'X':
      eatChar(L, result, 'x')       # normalizes 0X to 0x in the literal
      numDigits = matchUnderscoreChars(L, result, {'0'..'9', 'a'..'f', 'A'..'F'})
    of 'o':
      eatChar(L, result, 'o')
      numDigits = matchUnderscoreChars(L, result, {'0'..'7'})
    of 'b', 'B':
      eatChar(L, result, 'b')
      numDigits = matchUnderscoreChars(L, result, {'0'..'1'})
    else:
      internalError(L.config, getLineInfo(L), "getNumber")
    if numDigits == 0:
      lexMessageLitNum(L, "invalid number: '$1'", startpos)
  else:
    # plain decimal; '.' or 'e'/'E' upgrades it to a float literal
    discard matchUnderscoreChars(L, result, {'0'..'9'})
    if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}):
      result.tokType = tkFloatLit
      eatChar(L, result, '.')
      discard matchUnderscoreChars(L, result, {'0'..'9'})
    if L.buf[L.bufpos] in {'e', 'E'}:
      result.tokType = tkFloatLit
      eatChar(L, result, 'e')       # normalizes 'E' to 'e'
      if L.buf[L.bufpos] in {'+', '-'}:
        eatChar(L, result)
      discard matchUnderscoreChars(L, result, {'0'..'9'})
  endpos = L.bufpos

  # Second stage, find out if there's a datatype suffix and handle it
  var postPos = endpos
  if L.buf[postPos] in {'\'', 'f', 'F', 'd', 'D', 'i', 'I', 'u', 'U'}:
    if L.buf[postPos] == '\'':
      inc(postPos)                  # the apostrophe is optional
    case L.buf[postPos]
    of 'f', 'F':
      inc(postPos)
      if (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'):
        result.tokType = tkFloat32Lit
        inc(postPos, 2)
      elif (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'):
        result.tokType = tkFloat64Lit
        inc(postPos, 2)
      elif (L.buf[postPos] == '1') and
           (L.buf[postPos + 1] == '2') and
           (L.buf[postPos + 2] == '8'):
        result.tokType = tkFloat128Lit
        inc(postPos, 3)
      else: # "f" alone defaults to float32
        result.tokType = tkFloat32Lit
    of 'd', 'D': # ad hoc convenience shortcut for f64
      inc(postPos)
      result.tokType = tkFloat64Lit
    of 'i', 'I':
      inc(postPos)
      if (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'):
        result.tokType = tkInt64Lit
        inc(postPos, 2)
      elif (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'):
        result.tokType = tkInt32Lit
        inc(postPos, 2)
      elif (L.buf[postPos] == '1') and (L.buf[postPos + 1] == '6'):
        result.tokType = tkInt16Lit
        inc(postPos, 2)
      elif (L.buf[postPos] == '8'):
        result.tokType = tkInt8Lit
        inc(postPos)
      else:
        lexMessageLitNum(L, "invalid number: '$1'", startpos)
    of 'u', 'U':
      inc(postPos)
      if (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'):
        result.tokType = tkUInt64Lit
        inc(postPos, 2)
      elif (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'):
        result.tokType = tkUInt32Lit
        inc(postPos, 2)
      elif (L.buf[postPos] == '1') and (L.buf[postPos + 1] == '6'):
        result.tokType = tkUInt16Lit
        inc(postPos, 2)
      elif (L.buf[postPos] == '8'):
        result.tokType = tkUInt8Lit
        inc(postPos)
      else:
        result.tokType = tkUIntLit  # bare 'u' suffix
    else:
      lexMessageLitNum(L, "invalid number: '$1'", startpos)

  # Is there still a literalish char awaiting? Then it's an error!
  if L.buf[postPos] in literalishChars or
     (L.buf[postPos] == '.' and L.buf[postPos + 1] in {'0'..'9'}):
    lexMessageLitNum(L, "invalid number: '$1'", startpos)

  # Third stage, extract actual number
  L.bufpos = startpos            # restore position
  var pos: int = startpos
  try:
    if (L.buf[pos] == '0') and (L.buf[pos + 1] in baseCodeChars):
      inc(pos, 2)
      xi = 0                     # it is a base prefix
      case L.buf[pos - 1]
      of 'b', 'B':
        result.base = base2
        while pos < endpos:
          if L.buf[pos] != '_':
            xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0'))
          inc(pos)
      # 'c', 'C' is deprecated
      of 'o', 'c', 'C':
        result.base = base8
        while pos < endpos:
          if L.buf[pos] != '_':
            xi = `shl`(xi, 3) or (ord(L.buf[pos]) - ord('0'))
          inc(pos)
      of 'x', 'X':
        result.base = base16
        while pos < endpos:
          case L.buf[pos]
          of '_':
            inc(pos)
          of '0'..'9':
            xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0'))
            inc(pos)
          of 'a'..'f':
            xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10)
            inc(pos)
          of 'A'..'F':
            xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10)
            inc(pos)
          else:
            break
      else:
        internalError(L.config, getLineInfo(L), "getNumber")
      # truncate/sign-extend the bit pattern to the suffix's width:
      case result.tokType
      of tkIntLit, tkInt64Lit: result.iNumber = xi
      of tkInt8Lit: result.iNumber = ashr(xi shl 56, 56)
      of tkInt16Lit: result.iNumber = ashr(xi shl 48, 48)
      of tkInt32Lit: result.iNumber = ashr(xi shl 32, 32)
      of tkUIntLit, tkUInt64Lit: result.iNumber = xi
      of tkUInt8Lit: result.iNumber = xi and 0xff
      of tkUInt16Lit: result.iNumber = xi and 0xffff
      of tkUInt32Lit: result.iNumber = xi and 0xffffffff
      of tkFloat32Lit:
        result.fNumber = (cast[PFloat32](addr(xi)))[]
        # note: this code is endian neutral!
        # XXX: Test this on big endian machine!
      of tkFloat64Lit, tkFloatLit:
        result.fNumber = (cast[PFloat64](addr(xi)))[]
      else: internalError(L.config, getLineInfo(L), "getNumber")
      # Bounds checks. Non decimal literals are allowed to overflow the range of
      # the datatype as long as their pattern don't overflow _bitwise_, hence
      # below checks of signed sizes against uint*.high is deliberate:
      # (0x80'u8 = 128, 0x80'i8 = -128, etc == OK)
      if result.tokType notin floatTypes:
        let outOfRange = case result.tokType:
          of tkUInt8Lit, tkUInt16Lit, tkUInt32Lit: result.iNumber != xi
          of tkInt8Lit: (xi > BiggestInt(uint8.high))
          of tkInt16Lit: (xi > BiggestInt(uint16.high))
          of tkInt32Lit: (xi > BiggestInt(uint32.high))
          else: false
        if outOfRange:
          #echo "out of range num: ", result.iNumber, " vs ", xi
          lexMessageLitNum(L, "number out of range: '$1'", startpos)
    else:
      # decimal: parse the literal text we accumulated in stage one
      case result.tokType
      of floatTypes:
        result.fNumber = parseFloat(result.literal)
      of tkUInt64Lit, tkUIntLit:
        var iNumber: uint64
        var len: int
        try:
          len = parseBiggestUInt(result.literal, iNumber)
        except ValueError:
          raise newException(OverflowDefect, "number out of range: " & $result.literal)
        if len != result.literal.len:
          raise newException(ValueError, "invalid integer: " & $result.literal)
        result.iNumber = cast[int64](iNumber)
      else:
        var iNumber: int64
        var len: int
        try:
          len = parseBiggestInt(result.literal, iNumber)
        except ValueError:
          raise newException(OverflowDefect, "number out of range: " & $result.literal)
        if len != result.literal.len:
          raise newException(ValueError, "invalid integer: " & $result.literal)
        result.iNumber = iNumber
      # Explicit bounds checks. Only T.high needs to be considered
      # since result.iNumber can't be negative.
      let outOfRange =
        case result.tokType
        of tkInt8Lit: result.iNumber > int8.high
        of tkUInt8Lit: result.iNumber > BiggestInt(uint8.high)
        of tkInt16Lit: result.iNumber > int16.high
        of tkUInt16Lit: result.iNumber > BiggestInt(uint16.high)
        of tkInt32Lit: result.iNumber > int32.high
        of tkUInt32Lit: result.iNumber > BiggestInt(uint32.high)
        else: false
      if outOfRange: lexMessageLitNum(L, "number out of range: '$1'", startpos)
      # Promote int literal to int64? Not always necessary, but more consistent
      if result.tokType == tkIntLit:
        if result.iNumber > high(int32):
          result.tokType = tkInt64Lit
  except ValueError:
    lexMessageLitNum(L, "invalid number: '$1'", startpos)
  except OverflowDefect, RangeDefect:
    lexMessageLitNum(L, "number out of range: '$1'", startpos)
  tokenEnd(result, postPos-1)
  L.bufpos = postPos
proc handleHexChar(L: var Lexer, xi: var int; position: range[0..4]) =
  ## Folds one hex digit at the cursor into `xi` (xi = xi*16 + digit) and
  ## advances. `position` is the 1-based index of the digit within a \x or
  ## \u escape, or 0 for the open-ended \u{...} form; it controls how a
  ## premature quote is treated (error only for the first digit).
  template invalid() =
    lexMessage(L, errGenerated,
      "expected a hex digit, but found: " & L.buf[L.bufpos] &
        "; maybe prepend with 0")
  case L.buf[L.bufpos]
  of '0'..'9':
    xi = (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('0'))
    inc(L.bufpos)
  of 'a'..'f':
    xi = (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('a') + 10)
    inc(L.bufpos)
  of 'A'..'F':
    xi = (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('A') + 10)
    inc(L.bufpos)
  of '"', '\'':
    if position <= 1: invalid()
    # do not progress the bufpos here.
    if position == 0: inc(L.bufpos)
  else:
    invalid()
    # Need to progress for `nim check`
    inc(L.bufpos)
  554. proc handleDecChars(L: var Lexer, xi: var int) =
  555. while L.buf[L.bufpos] in {'0'..'9'}:
  556. xi = (xi * 10) + (ord(L.buf[L.bufpos]) - ord('0'))
  557. inc(L.bufpos)
  558. proc addUnicodeCodePoint(s: var string, i: int) =
  559. let i = cast[uint](i)
  560. # inlined toUTF-8 to avoid unicode and strutils dependencies.
  561. let pos = s.len
  562. if i <= 127:
  563. s.setLen(pos+1)
  564. s[pos+0] = chr(i)
  565. elif i <= 0x07FF:
  566. s.setLen(pos+2)
  567. s[pos+0] = chr((i shr 6) or 0b110_00000)
  568. s[pos+1] = chr((i and ones(6)) or 0b10_0000_00)
  569. elif i <= 0xFFFF:
  570. s.setLen(pos+3)
  571. s[pos+0] = chr(i shr 12 or 0b1110_0000)
  572. s[pos+1] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  573. s[pos+2] = chr(i and ones(6) or 0b10_0000_00)
  574. elif i <= 0x001FFFFF:
  575. s.setLen(pos+4)
  576. s[pos+0] = chr(i shr 18 or 0b1111_0000)
  577. s[pos+1] = chr(i shr 12 and ones(6) or 0b10_0000_00)
  578. s[pos+2] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  579. s[pos+3] = chr(i and ones(6) or 0b10_0000_00)
  580. elif i <= 0x03FFFFFF:
  581. s.setLen(pos+5)
  582. s[pos+0] = chr(i shr 24 or 0b111110_00)
  583. s[pos+1] = chr(i shr 18 and ones(6) or 0b10_0000_00)
  584. s[pos+2] = chr(i shr 12 and ones(6) or 0b10_0000_00)
  585. s[pos+3] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  586. s[pos+4] = chr(i and ones(6) or 0b10_0000_00)
  587. elif i <= 0x7FFFFFFF:
  588. s.setLen(pos+6)
  589. s[pos+0] = chr(i shr 30 or 0b1111110_0)
  590. s[pos+1] = chr(i shr 24 and ones(6) or 0b10_0000_00)
  591. s[pos+2] = chr(i shr 18 and ones(6) or 0b10_0000_00)
  592. s[pos+3] = chr(i shr 12 and ones(6) or 0b10_0000_00)
  593. s[pos+4] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  594. s[pos+5] = chr(i and ones(6) or 0b10_0000_00)
proc getEscapedChar(L: var Lexer, tok: var Token) =
  ## Scans one backslash escape (cursor on the '\') and appends its
  ## expansion to `tok.literal`. \p and \u are rejected inside character
  ## literals; \x takes exactly two hex digits, \u takes four or a braced
  ## {..} run; a decimal escape must fit in a byte.
  inc(L.bufpos)               # skip '\'
  case L.buf[L.bufpos]
  of 'n', 'N':
    tok.literal.add('\L')
    inc(L.bufpos)
  of 'p', 'P':
    # platform-specific newline; meaningless for a single char
    if tok.tokType == tkCharLit:
      lexMessage(L, errGenerated, "\\p not allowed in character literal")
    tok.literal.add(L.config.target.tnl)
    inc(L.bufpos)
  of 'r', 'R', 'c', 'C':
    tok.literal.add(CR)
    inc(L.bufpos)
  of 'l', 'L':
    tok.literal.add(LF)
    inc(L.bufpos)
  of 'f', 'F':
    tok.literal.add(FF)
    inc(L.bufpos)
  of 'e', 'E':
    tok.literal.add(ESC)
    inc(L.bufpos)
  of 'a', 'A':
    tok.literal.add(BEL)
    inc(L.bufpos)
  of 'b', 'B':
    tok.literal.add(BACKSPACE)
    inc(L.bufpos)
  of 'v', 'V':
    tok.literal.add(VT)
    inc(L.bufpos)
  of 't', 'T':
    tok.literal.add('\t')
    inc(L.bufpos)
  of '\'', '\"':
    tok.literal.add(L.buf[L.bufpos])
    inc(L.bufpos)
  of '\\':
    tok.literal.add('\\')
    inc(L.bufpos)
  of 'x', 'X':
    # \xHH: exactly two hex digits -> one raw byte
    inc(L.bufpos)
    var xi = 0
    handleHexChar(L, xi, 1)
    handleHexChar(L, xi, 2)
    tok.literal.add(chr(xi))
  of 'u', 'U':
    if tok.tokType == tkCharLit:
      lexMessage(L, errGenerated, "\\u not allowed in character literal")
    inc(L.bufpos)
    var xi = 0
    if L.buf[L.bufpos] == '{':
      # \u{...}: arbitrary-length hex code point, must be non-empty
      inc(L.bufpos)
      var start = L.bufpos
      while L.buf[L.bufpos] != '}':
        handleHexChar(L, xi, 0)
      if start == L.bufpos:
        lexMessage(L, errGenerated,
          "Unicode codepoint cannot be empty")
      inc(L.bufpos)           # skip '}'
      if xi > 0x10FFFF:
        let hex = ($L.buf)[start..L.bufpos-2]
        lexMessage(L, errGenerated,
          "Unicode codepoint must be lower than 0x10FFFF, but was: " & hex)
    else:
      # \uHHHH: exactly four hex digits
      handleHexChar(L, xi, 1)
      handleHexChar(L, xi, 2)
      handleHexChar(L, xi, 3)
      handleHexChar(L, xi, 4)
    addUnicodeCodePoint(tok.literal, xi)
  of '0'..'9':
    # decimal escape \DDD; a leading zero triggers the octal-style warning
    if matchTwoChars(L, '0', {'0'..'9'}):
      lexMessage(L, warnOctalEscape)
    var xi = 0
    handleDecChars(L, xi)
    if (xi <= 255): tok.literal.add(chr(xi))
    else: lexMessage(L, errGenerated, "invalid character constant")
  else: lexMessage(L, errGenerated, "invalid character constant")
proc handleCRLF(L: var Lexer, pos: int): int =
  ## If `pos` is at a CR or LF, advances past the line break (delegating
  ## the line bookkeeping to nimlexbase) and returns the position of the
  ## first character of the next line; otherwise returns `pos` unchanged.
  template registerLine =
    # Emit a "line too long" hint for the line that just ended.
    # Suppressed for nimpretty, which must reproduce the source verbatim.
    let col = L.getColNumber(pos)
    when not defined(nimpretty):
      if col > MaxLineLength:
        lexMessagePos(L, hintLineTooLong, pos)
  case L.buf[pos]
  of CR:
    registerLine()
    result = nimlexbase.handleCR(L, pos)
  of LF:
    registerLine()
    result = nimlexbase.handleLF(L, pos)
  else: result = pos
type
  StringMode = enum
    ## How getString should treat the string literal being scanned.
    normal,      # ordinary "..." literal: backslash escapes are decoded
    raw,         # r"..." literal: backslashes are kept verbatim, "" is a quote
    generalized  # ident"..." literal: raw string handed to a callable
proc getString(L: var Lexer, tok: var Token, mode: StringMode) =
  ## Scans a string literal; L.bufpos points at the opening '"'.
  ## Produces tkTripleStrLit for """...""" literals, otherwise tkStrLit
  ## (normal mode) or tkRStrLit (raw/generalized). The decoded content is
  ## accumulated in tok.literal.
  var pos = L.bufpos
  var line = L.lineNumber # save linenumber for better error message
  tokenBegin(tok, pos - ord(mode == raw))
  inc pos # skip "
  if L.buf[pos] == '\"' and L.buf[pos+1] == '\"':
    tok.tokType = tkTripleStrLit # long string literal:
    inc(pos, 2) # skip ""
    # skip leading newline:
    if L.buf[pos] in {' ', '\t'}:
      var newpos = pos+1
      while L.buf[newpos] in {' ', '\t'}: inc newpos
      if L.buf[newpos] in {CR, LF}: pos = newpos
    pos = handleCRLF(L, pos)
    while true:
      case L.buf[pos]
      of '\"':
        # the literal only ends at """ NOT followed by another '"', so
        # literals may themselves end in a quote character:
        if L.buf[pos+1] == '\"' and L.buf[pos+2] == '\"' and
            L.buf[pos+3] != '\"':
          tokenEndIgnore(tok, pos+2)
          L.bufpos = pos + 3 # skip the three """
          break
        tok.literal.add('\"')
        inc(pos)
      of CR, LF:
        # line breaks are normalized to a single '\n' in the literal:
        tokenEndIgnore(tok, pos)
        pos = handleCRLF(L, pos)
        tok.literal.add("\n")
      of nimlexbase.EndOfFile:
        tokenEndIgnore(tok, pos)
        # report the error at the line the literal started on:
        var line2 = L.lineNumber
        L.lineNumber = line
        lexMessagePos(L, errGenerated, L.lineStart, "closing \"\"\" expected, but end of file reached")
        L.lineNumber = line2
        L.bufpos = pos
        break
      else:
        tok.literal.add(L.buf[pos])
        inc(pos)
  else:
    # ordinary string literal
    if mode != normal: tok.tokType = tkRStrLit
    else: tok.tokType = tkStrLit
    while true:
      var c = L.buf[pos]
      if c == '\"':
        # in raw literals, "" denotes an embedded quote:
        if mode != normal and L.buf[pos+1] == '\"':
          inc(pos, 2)
          tok.literal.add('"')
        else:
          tokenEndIgnore(tok, pos)
          inc(pos) # skip '"'
          break
      elif c in {CR, LF, nimlexbase.EndOfFile}:
        # single-line literals cannot span line breaks:
        tokenEndIgnore(tok, pos)
        lexMessage(L, errGenerated, "closing \" expected")
        break
      elif (c == '\\') and mode == normal:
        # getEscapedChar works on L.bufpos, so sync it both ways:
        L.bufpos = pos
        getEscapedChar(L, tok)
        pos = L.bufpos
      else:
        tok.literal.add(c)
        inc(pos)
    L.bufpos = pos
proc getCharacter(L: var Lexer, tok: var Token) =
  ## Scans a character literal; L.bufpos points at the opening '\''.
  ## The (possibly escape-decoded) character ends up in tok.literal.
  tokenBegin(tok, L.bufpos)
  inc(L.bufpos) # skip '
  var c = L.buf[L.bufpos]
  case c
  of '\0'..pred(' '), '\'':
    # control characters and a bare ' must be escaped:
    lexMessage(L, errGenerated, "invalid character literal")
    tok.literal = $c
  of '\\': getEscapedChar(L, tok)  # advances L.bufpos past the escape
  else:
    tok.literal = $c
    inc(L.bufpos)
  if L.buf[L.bufpos] != '\'':
    lexMessage(L, errGenerated, "missing closing ' for character literal")
  tokenEndIgnore(tok, L.bufpos)
  inc(L.bufpos) # skip '
proc getSymbol(L: var Lexer, tok: var Token) =
  ## Scans an identifier or keyword, hashing it incrementally as it goes.
  ## Nim identifiers are case/underscore-insensitive (except the first
  ## character), so uppercase letters are lowered and underscores skipped
  ## before hashing. The ident is interned via L.cache; idents whose ids
  ## fall in the keyword range get their dedicated keyword token type.
  var h: Hash = 0
  var pos = L.bufpos
  tokenBegin(tok, pos)
  var suspicious = false
  while true:
    var c = L.buf[pos]
    case c
    of 'a'..'z', '0'..'9', '\x80'..'\xFF':
      h = h !& ord(c)
      inc(pos)
    of 'A'..'Z':
      c = chr(ord(c) + (ord('a') - ord('A'))) # toLower()
      h = h !& ord(c)
      inc(pos)
      # mixed casing may deviate from the declared style; checked below
      suspicious = true
    of '_':
      if L.buf[pos+1] notin SymChars:
        lexMessage(L, errGenerated, "invalid token: trailing underscore")
        break
      inc(pos)
      suspicious = true
    else: break
  tokenEnd(tok, pos-1)
  h = !$h
  tok.ident = L.cache.getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, h)
  # keyword idents are allocated so that id + ord(tkSymbol) maps onto the
  # keyword's TokType; anything outside that range is a plain symbol:
  if (tok.ident.id < ord(tokKeywordLow) - ord(tkSymbol)) or
      (tok.ident.id > ord(tokKeywordHigh) - ord(tkSymbol)):
    tok.tokType = tkSymbol
  else:
    tok.tokType = TokType(tok.ident.id + ord(tkSymbol))
  # style checking: report identifiers that differ from their normalized form
  if suspicious and {optStyleHint, optStyleError} * L.config.globalOptions != {}:
    lintReport(L.config, getLineInfo(L), tok.ident.s.normalize, tok.ident.s)
  L.bufpos = pos
  808. proc endOperator(L: var Lexer, tok: var Token, pos: int,
  809. hash: Hash) {.inline.} =
  810. var h = !$hash
  811. tok.ident = L.cache.getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, h)
  812. if (tok.ident.id < oprLow) or (tok.ident.id > oprHigh): tok.tokType = tkOpr
  813. else: tok.tokType = TokType(tok.ident.id - oprLow + ord(tkColon))
  814. L.bufpos = pos
proc getOperator(L: var Lexer, tok: var Token) =
  ## Scans a maximal run of operator characters, then records the amount
  ## of trailing whitespace in tok.strongSpaceB (-1 if the operator is the
  ## last thing on the line), which the parser uses to distinguish unary
  ## from binary operator usage.
  var pos = L.bufpos
  tokenBegin(tok, pos)
  var h: Hash = 0
  while true:
    var c = L.buf[pos]
    if c notin OpChars: break
    h = h !& ord(c)
    inc(pos)
  endOperator(L, tok, pos, h)
  tokenEnd(tok, pos-1)
  # advance pos but don't store it in L.bufpos so the next token (which might
  # be an operator too) gets the preceding spaces:
  tok.strongSpaceB = 0
  while L.buf[pos] == ' ':
    inc pos
    inc tok.strongSpaceB
  if L.buf[pos] in {CR, LF, nimlexbase.EndOfFile}:
    tok.strongSpaceB = -1
  834. proc getPrecedence*(tok: Token): int =
  835. ## Calculates the precedence of the given token.
  836. case tok.tokType
  837. of tkOpr:
  838. let relevantChar = tok.ident.s[0]
  839. # arrow like?
  840. if tok.ident.s.len > 1 and tok.ident.s[^1] == '>' and
  841. tok.ident.s[^2] in {'-', '~', '='}: return 1
  842. template considerAsgn(value: untyped) =
  843. result = if tok.ident.s[^1] == '=': 1 else: value
  844. case relevantChar
  845. of '$', '^': considerAsgn(10)
  846. of '*', '%', '/', '\\': considerAsgn(9)
  847. of '~': result = 8
  848. of '+', '-', '|': considerAsgn(8)
  849. of '&': considerAsgn(7)
  850. of '=', '<', '>', '!': result = 5
  851. of '.': considerAsgn(6)
  852. of '?': result = 2
  853. else: considerAsgn(2)
  854. of tkDiv, tkMod, tkShl, tkShr: result = 9
  855. of tkDotDot: result = 6
  856. of tkIn, tkNotin, tkIs, tkIsnot, tkOf, tkAs, tkFrom: result = 5
  857. of tkAnd: result = 4
  858. of tkOr, tkXor, tkPtr, tkRef: result = 3
  859. else: return -10
  860. proc newlineFollows*(L: Lexer): bool =
  861. var pos = L.bufpos
  862. while true:
  863. case L.buf[pos]
  864. of ' ', '\t':
  865. inc(pos)
  866. of CR, LF:
  867. result = true
  868. break
  869. of '#':
  870. inc(pos)
  871. if L.buf[pos] == '#': inc(pos)
  872. if L.buf[pos] != '[': return true
  873. else:
  874. break
proc skipMultiLineComment(L: var Lexer; tok: var Token; start: int;
                          isDoc: bool) =
  ## Skips (or, for doc comments, collects) a multi-line comment starting
  ## at `start`, handling nesting. For `isDoc` the delimiters are '##[' and
  ## ']##' and the text is appended to tok.literal with the comment's
  ## leading indentation stripped; otherwise '#[' and ']#' delimit it and
  ## (except for nimpretty) the content is discarded.
  var pos = start
  var toStrip = 0
  tokenBegin(tok, pos)
  # detect the amount of indentation:
  if isDoc:
    toStrip = getColNumber(L, pos)
    while L.buf[pos] == ' ': inc pos
    if L.buf[pos] in {CR, LF}:
      # delimiter is alone on its line: strip by the next line's indent
      pos = handleCRLF(L, pos)
      toStrip = 0
      while L.buf[pos] == ' ':
        inc pos
        inc toStrip
  var nesting = 0
  while true:
    case L.buf[pos]
    of '#':
      if isDoc:
        if L.buf[pos+1] == '#' and L.buf[pos+2] == '[':
          inc nesting
        tok.literal.add '#'
      elif L.buf[pos+1] == '[':
        inc nesting
      inc pos
    of ']':
      if isDoc:
        if L.buf[pos+1] == '#' and L.buf[pos+2] == '#':
          if nesting == 0:
            tokenEndIgnore(tok, pos+2)
            inc(pos, 3)
            break
          dec nesting
        tok.literal.add ']'
      elif L.buf[pos+1] == '#':
        if nesting == 0:
          tokenEndIgnore(tok, pos+1)
          inc(pos, 2)
          break
        dec nesting
      inc pos
    of CR, LF:
      tokenEndIgnore(tok, pos)
      pos = handleCRLF(L, pos)
      # strip leading whitespace:
      when defined(nimpretty): tok.literal.add "\L"
      if isDoc:
        when not defined(nimpretty): tok.literal.add "\n"
        inc tok.iNumber    # iNumber counts the newlines in the comment
        # remove up to `toStrip` leading spaces of the new line:
        var c = toStrip
        while L.buf[pos] == ' ' and c > 0:
          inc pos
          dec c
    of nimlexbase.EndOfFile:
      tokenEndIgnore(tok, pos)
      lexMessagePos(L, errGenerated, pos, "end of multiline comment expected")
      break
    else:
      if isDoc or defined(nimpretty): tok.literal.add L.buf[pos]
      inc(pos)
  L.bufpos = pos
  when defined(nimpretty):
    tok.commentOffsetB = L.offsetBase + pos - 1
  939. proc scanComment(L: var Lexer, tok: var Token) =
  940. var pos = L.bufpos
  941. tok.tokType = tkComment
  942. # iNumber contains the number of '\n' in the token
  943. tok.iNumber = 0
  944. assert L.buf[pos+1] == '#'
  945. when defined(nimpretty):
  946. tok.commentOffsetA = L.offsetBase + pos
  947. if L.buf[pos+2] == '[':
  948. skipMultiLineComment(L, tok, pos+3, true)
  949. return
  950. tokenBegin(tok, pos)
  951. inc(pos, 2)
  952. var toStrip = 0
  953. while L.buf[pos] == ' ':
  954. inc pos
  955. inc toStrip
  956. while true:
  957. var lastBackslash = -1
  958. while L.buf[pos] notin {CR, LF, nimlexbase.EndOfFile}:
  959. if L.buf[pos] == '\\': lastBackslash = pos+1
  960. tok.literal.add(L.buf[pos])
  961. inc(pos)
  962. tokenEndIgnore(tok, pos)
  963. pos = handleCRLF(L, pos)
  964. var indent = 0
  965. while L.buf[pos] == ' ':
  966. inc(pos)
  967. inc(indent)
  968. if L.buf[pos] == '#' and L.buf[pos+1] == '#':
  969. tok.literal.add "\n"
  970. inc(pos, 2)
  971. var c = toStrip
  972. while L.buf[pos] == ' ' and c > 0:
  973. inc pos
  974. dec c
  975. inc tok.iNumber
  976. else:
  977. if L.buf[pos] > ' ':
  978. L.indentAhead = indent
  979. tokenEndIgnore(tok, pos)
  980. break
  981. L.bufpos = pos
  982. when defined(nimpretty):
  983. tok.commentOffsetB = L.offsetBase + pos - 1
proc skip(L: var Lexer, tok: var Token) =
  ## Skips whitespace and non-doc comments before the next token, while
  ## computing tok.strongSpaceA (spaces directly before the token) and,
  ## after a line break, tok.indent / L.currLineIndent. Doc comments ('##')
  ## are NOT skipped — they become tokens of their own. For nimpretty,
  ## skipped comments are captured into the token instead of discarded.
  var pos = L.bufpos
  tokenBegin(tok, pos)
  tok.strongSpaceA = 0
  when defined(nimpretty):
    var hasComment = false
    var commentIndent = L.currLineIndent
    tok.commentOffsetA = L.offsetBase + pos
    tok.commentOffsetB = tok.commentOffsetA
    tok.line = -1
  while true:
    case L.buf[pos]
    of ' ':
      inc(pos)
      inc(tok.strongSpaceA)
    of '\t':
      if not L.allowTabs: lexMessagePos(L, errGenerated, pos, "tabs are not allowed, use spaces instead")
      inc(pos)
    of CR, LF:
      tokenEndPrevious(tok, pos)
      pos = handleCRLF(L, pos)
      # measure the new line's indentation; multi-line comments opening at
      # the start of a line are skipped as part of the indentation:
      var indent = 0
      while true:
        if L.buf[pos] == ' ':
          inc(pos)
          inc(indent)
        elif L.buf[pos] == '#' and L.buf[pos+1] == '[':
          when defined(nimpretty):
            hasComment = true
            if tok.line < 0:
              tok.line = L.lineNumber
              commentIndent = indent
          skipMultiLineComment(L, tok, pos+2, false)
          pos = L.bufpos
        else:
          break
      tok.strongSpaceA = 0
      when defined(nimpretty):
        if L.buf[pos] == '#' and tok.line < 0: commentIndent = indent
      # the indentation only counts if real code (or a doc comment)
      # follows; blank/comment-only lines keep scanning:
      if L.buf[pos] > ' ' and (L.buf[pos] != '#' or L.buf[pos+1] == '#'):
        tok.indent = indent
        L.currLineIndent = indent
        break
    of '#':
      # do not skip documentation comment:
      if L.buf[pos+1] == '#': break
      when defined(nimpretty):
        hasComment = true
        if tok.line < 0:
          tok.line = L.lineNumber
      if L.buf[pos+1] == '[':
        skipMultiLineComment(L, tok, pos+2, false)
        pos = L.bufpos
      else:
        # ordinary line comment: skip to end of line
        tokenBegin(tok, pos)
        while L.buf[pos] notin {CR, LF, nimlexbase.EndOfFile}:
          when defined(nimpretty): tok.literal.add L.buf[pos]
          inc(pos)
        tokenEndIgnore(tok, pos+1)
        when defined(nimpretty):
          tok.commentOffsetB = L.offsetBase + pos + 1
    else:
      break # EndOfFile also leaves the loop
  tokenEndPrevious(tok, pos-1)
  L.bufpos = pos
  when defined(nimpretty):
    if hasComment:
      tok.commentOffsetB = L.offsetBase + pos - 1
      tok.tokType = tkComment
      tok.indent = commentIndent
proc rawGetTok*(L: var Lexer, tok: var Token) =
  ## Scans the next token from the input buffer into `tok`: skips leading
  ## whitespace/comments, records line/column, then dispatches on the
  ## first character to the specialized scanners.
  template atTokenEnd() {.dirty.} =
    when defined(nimsuggest):
      # we attach the cursor to the last *strong* token
      if tok.tokType notin weakTokens:
        L.previousToken.line = tok.line.uint16
        L.previousToken.col = tok.col.int16
  fillToken(tok)
  # an indentation value left behind by scanComment/nimpretty takes
  # precedence over "no indentation" (-1):
  if L.indentAhead >= 0:
    tok.indent = L.indentAhead
    L.currLineIndent = L.indentAhead
    L.indentAhead = -1
  else:
    tok.indent = -1
  skip(L, tok)
  when defined(nimpretty):
    if tok.tokType == tkComment:
      L.indentAhead = L.currLineIndent
      return
  var c = L.buf[L.bufpos]
  tok.line = L.lineNumber
  tok.col = getColNumber(L, L.bufpos)
  # 'r'/'R' are excluded here because they may start a raw string literal:
  if c in SymStartChars - {'r', 'R'}:
    getSymbol(L, tok)
  else:
    case c
    of '#':
      scanComment(L, tok)
    of '*':
      # '*:' is unfortunately a special case, because it is two tokens in
      # 'var v*: int'.
      if L.buf[L.bufpos+1] == ':' and L.buf[L.bufpos+2] notin OpChars:
        var h = 0 !& ord('*')
        endOperator(L, tok, L.bufpos+1, h)
      else:
        getOperator(L, tok)
    of ',':
      tok.tokType = tkComma
      inc(L.bufpos)
    of 'r', 'R':
      if L.buf[L.bufpos + 1] == '\"':
        inc(L.bufpos)
        getString(L, tok, raw)
      else:
        getSymbol(L, tok)
    of '(':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '.' and L.buf[L.bufpos+1] != '.':
        tok.tokType = tkParDotLe
        inc(L.bufpos)
      else:
        tok.tokType = tkParLe
        when defined(nimsuggest):
          # ideCon tracks the '(' of the call under the cursor:
          if L.fileIdx == L.config.m.trackPos.fileIndex and tok.col < L.config.m.trackPos.col and
              tok.line == L.config.m.trackPos.line.int and L.config.ideCmd == ideCon:
            L.config.m.trackPos.col = tok.col.int16
    of ')':
      tok.tokType = tkParRi
      inc(L.bufpos)
    of '[':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '.' and L.buf[L.bufpos+1] != '.':
        tok.tokType = tkBracketDotLe
        inc(L.bufpos)
      elif L.buf[L.bufpos] == ':':
        tok.tokType = tkBracketLeColon
        inc(L.bufpos)
      else:
        tok.tokType = tkBracketLe
    of ']':
      tok.tokType = tkBracketRi
      inc(L.bufpos)
    of '.':
      when defined(nimsuggest):
        # ideSug treats the '.' right before the cursor as a member-access
        # dot and stops tokenizing the rest of the operator:
        if L.fileIdx == L.config.m.trackPos.fileIndex and tok.col+1 == L.config.m.trackPos.col and
            tok.line == L.config.m.trackPos.line.int and L.config.ideCmd == ideSug:
          tok.tokType = tkDot
          L.config.m.trackPos.col = tok.col.int16
          inc(L.bufpos)
          atTokenEnd()
          return
      if L.buf[L.bufpos+1] == ']':
        tok.tokType = tkBracketDotRi
        inc(L.bufpos, 2)
      elif L.buf[L.bufpos+1] == '}':
        tok.tokType = tkCurlyDotRi
        inc(L.bufpos, 2)
      elif L.buf[L.bufpos+1] == ')':
        tok.tokType = tkParDotRi
        inc(L.bufpos, 2)
      else:
        getOperator(L, tok)
    of '{':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '.' and L.buf[L.bufpos+1] != '.':
        tok.tokType = tkCurlyDotLe
        inc(L.bufpos)
      else:
        tok.tokType = tkCurlyLe
    of '}':
      tok.tokType = tkCurlyRi
      inc(L.bufpos)
    of ';':
      tok.tokType = tkSemiColon
      inc(L.bufpos)
    of '`':
      tok.tokType = tkAccent
      inc(L.bufpos)
    of '_':
      # a lone '_' is a symbol; '_' followed by symbol chars is invalid
      # (leading underscores are not allowed in identifiers):
      inc(L.bufpos)
      if L.buf[L.bufpos] notin SymChars+{'_'}:
        tok.tokType = tkSymbol
        tok.ident = L.cache.getIdent("_")
      else:
        tok.literal = $c
        tok.tokType = tkInvalid
        lexMessage(L, errGenerated, "invalid token: " & c & " (\\" & $(ord(c)) & ')')
    of '\"':
      # check for generalized raw string literal:
      let mode = if L.bufpos > 0 and L.buf[L.bufpos-1] in SymChars: generalized else: normal
      getString(L, tok, mode)
      if mode == generalized:
        # tkRStrLit -> tkGStrLit
        # tkTripleStrLit -> tkGTripleStrLit
        inc(tok.tokType, 2)
    of '\'':
      tok.tokType = tkCharLit
      getCharacter(L, tok)
      tok.tokType = tkCharLit
    of '0'..'9':
      getNumber(L, tok)
      let c = L.buf[L.bufpos]
      if c in SymChars+{'_'}:
        lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier")
    else:
      if c in OpChars:
        getOperator(L, tok)
      elif c == nimlexbase.EndOfFile:
        tok.tokType = tkEof
        tok.indent = 0
      else:
        tok.literal = $c
        tok.tokType = tkInvalid
        lexMessage(L, errGenerated, "invalid token: " & c & " (\\" & $(ord(c)) & ')')
        inc(L.bufpos)
  atTokenEnd()
  1200. proc getIndentWidth*(fileIdx: FileIndex, inputstream: PLLStream;
  1201. cache: IdentCache; config: ConfigRef): int =
  1202. var lex: Lexer
  1203. var tok: Token
  1204. initToken(tok)
  1205. openLexer(lex, fileIdx, inputstream, cache, config)
  1206. var prevToken = tkEof
  1207. while tok.tokType != tkEof:
  1208. rawGetTok(lex, tok)
  1209. if tok.indent > 0 and prevToken in {tkColon, tkEquals, tkType, tkConst, tkLet, tkVar, tkUsing}:
  1210. result = tok.indent
  1211. if result > 0: break
  1212. prevToken = tok.tokType
  1213. closeLexer(lex)
  1214. proc getPrecedence*(ident: PIdent): int =
  1215. ## assumes ident is binary operator already
  1216. var tok: Token
  1217. initToken(tok)
  1218. tok.ident = ident
  1219. tok.tokType =
  1220. if tok.ident.id in ord(tokKeywordLow) - ord(tkSymbol)..ord(tokKeywordHigh) - ord(tkSymbol):
  1221. TokType(tok.ident.id + ord(tkSymbol))
  1222. else: tkOpr
  1223. getPrecedence(tok)