highlite.nim 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2012 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## Source highlighter for programming or markup languages.
  10. ## Currently only a few languages are supported; other languages may be added.
  11. ## The interface supports one language nested in another.
  12. ##
  13. ## You can use this to build your own syntax highlighting, check this example:
  14. ##
  15. ## ```Nim
  16. ## let code = """for x in $int.high: echo x.ord mod 2 == 0"""
  17. ## var toknizr: GeneralTokenizer
  18. ## initGeneralTokenizer(toknizr, code)
  19. ## while true:
  20. ##   getNextToken(toknizr, langNim)
  21. ##   case toknizr.kind
  22. ##   of gtEof: break # End Of File (or string)
  23. ##   of gtWhitespace:
  24. ##     echo gtWhitespace # Maybe you want "visible" whitespaces?.
  25. ##     echo substr(code, toknizr.start, toknizr.length + toknizr.start - 1)
  26. ##   of gtOperator:
  27. ##     echo gtOperator # Maybe you want Operators to use a specific color?.
  28. ##     echo substr(code, toknizr.start, toknizr.length + toknizr.start - 1)
  29. ##   # of gtSomeSymbol: syntaxHighlight("Comic Sans", "bold", "99px", "pink")
  30. ##   else:
  31. ##     echo toknizr.kind # All the kinds of tokens can be processed here.
  32. ##     echo substr(code, toknizr.start, toknizr.length + toknizr.start - 1)
  33. ## ```
  34. ##
  35. ## The proc `getSourceLanguage` can get the language `enum` from a string:
  36. ## ```Nim
  37. ## for l in ["C", "c++", "jAvA", "Nim", "c#"]: echo getSourceLanguage(l)
  38. ## ```
  39. ##
  40. ## There is also a `Cmd` pseudo-language supported, which is a simple generic
  41. ## shell/cmdline tokenizer (UNIX shell/Powershell/Windows Command):
  42. ## no escaping, no programming language constructs besides variable definition
  43. ## at the beginning of line. It supports these operators:
  44. ## ```Cmd
  45. ## & && | || ( ) '' "" ; # for comments
  46. ## ```
  47. ##
  48. ## Instead of escaping always use quotes like here
  49. ## `nimgrep --ext:'nim|nims' file.name`:cmd: shows how to input ``|``.
  50. ## Any argument that contains ``.`` or ``/`` or ``\`` will be treated
  51. ## as a file or directory.
  52. ##
  53. ## In addition to `Cmd` there is also `Console` language for
  54. ## displaying interactive sessions.
  55. ## Lines with a command should start with ``$``, other lines are considered
  56. ## as program output.
  57. import
  58. strutils
  59. from algorithm import binarySearch
  60. when defined(nimPreviewSlimSystem):
  61. import std/[assertions, syncio]
  62. type
  63. SourceLanguage* = enum
  64. langNone, langNim, langCpp, langCsharp, langC, langJava,
  65. langYaml, langPython, langCmd, langConsole
  66. TokenClass* = enum
  67. gtEof, gtNone, gtWhitespace, gtDecNumber, gtBinNumber, gtHexNumber,
  68. gtOctNumber, gtFloatNumber, gtIdentifier, gtKeyword, gtStringLit,
  69. gtLongStringLit, gtCharLit, gtEscapeSequence, # escape sequence like \xff
  70. gtOperator, gtPunctuation, gtComment, gtLongComment, gtRegularExpression,
  71. gtTagStart, gtTagEnd, gtKey, gtValue, gtRawData, gtAssembler,
  72. gtPreprocessor, gtDirective, gtCommand, gtRule, gtHyperlink, gtLabel,
  73. gtReference, gtPrompt, gtProgramOutput, gtProgram, gtOption, gtOther
  74. GeneralTokenizer* = object of RootObj
  75. kind*: TokenClass
  76. start*, length*: int
  77. buf: cstring
  78. pos: int
  79. state: TokenClass
  80. lang: SourceLanguage
  81. const
  82. sourceLanguageToStr*: array[SourceLanguage, string] = ["none",
  83. "Nim", "C++", "C#", "C", "Java", "Yaml", "Python", "Cmd", "Console"]
  84. sourceLanguageToAlpha*: array[SourceLanguage, string] = ["none",
  85. "Nim", "cpp", "csharp", "C", "Java", "Yaml", "Python", "Cmd", "Console"]
  86. ## list of languages spelled with alpabetic characters
  87. tokenClassToStr*: array[TokenClass, string] = ["Eof", "None", "Whitespace",
  88. "DecNumber", "BinNumber", "HexNumber", "OctNumber", "FloatNumber",
  89. "Identifier", "Keyword", "StringLit", "LongStringLit", "CharLit",
  90. "EscapeSequence", "Operator", "Punctuation", "Comment", "LongComment",
  91. "RegularExpression", "TagStart", "TagEnd", "Key", "Value", "RawData",
  92. "Assembler", "Preprocessor", "Directive", "Command", "Rule", "Hyperlink",
  93. "Label", "Reference", "Prompt", "ProgramOutput",
  94. # start from lower-case if there is a corresponding RST role (see rst.nim)
  95. "program", "option",
  96. "Other"]
  97. # The following list comes from doc/keywords.txt, make sure it is
  98. # synchronized with this array by running the module itself as a test case.
  99. nimKeywords = ["addr", "and", "as", "asm", "bind", "block",
  100. "break", "case", "cast", "concept", "const", "continue", "converter",
  101. "defer", "discard", "distinct", "div", "do",
  102. "elif", "else", "end", "enum", "except", "export",
  103. "finally", "for", "from", "func",
  104. "if", "import", "in", "include",
  105. "interface", "is", "isnot", "iterator", "let", "macro", "method",
  106. "mixin", "mod", "nil", "not", "notin", "object", "of", "or", "out", "proc",
  107. "ptr", "raise", "ref", "return", "shl", "shr", "static",
  108. "template", "try", "tuple", "type", "using", "var", "when", "while",
  109. "xor", "yield"]
  110. proc getSourceLanguage*(name: string): SourceLanguage =
  111. for i in succ(low(SourceLanguage)) .. high(SourceLanguage):
  112. if cmpIgnoreStyle(name, sourceLanguageToStr[i]) == 0:
  113. return i
  114. if cmpIgnoreStyle(name, sourceLanguageToAlpha[i]) == 0:
  115. return i
  116. result = langNone
  117. proc initGeneralTokenizer*(g: var GeneralTokenizer, buf: cstring) =
  118. g.buf = buf
  119. g.kind = low(TokenClass)
  120. g.start = 0
  121. g.length = 0
  122. g.state = low(TokenClass)
  123. g.lang = low(SourceLanguage)
  124. g.pos = 0
  125. proc initGeneralTokenizer*(g: var GeneralTokenizer, buf: string) =
  126. initGeneralTokenizer(g, cstring(buf))
  127. proc deinitGeneralTokenizer*(g: var GeneralTokenizer) =
  128. discard
  129. proc nimGetKeyword(id: string): TokenClass =
  130. for k in nimKeywords:
  131. if cmpIgnoreStyle(id, k) == 0: return gtKeyword
  132. result = gtIdentifier
  133. when false:
  134. var i = getIdent(id)
  135. if (i.id >= ord(tokKeywordLow) - ord(tkSymbol)) and
  136. (i.id <= ord(tokKeywordHigh) - ord(tkSymbol)):
  137. result = gtKeyword
  138. else:
  139. result = gtIdentifier
  140. proc nimNumberPostfix(g: var GeneralTokenizer, position: int): int =
  141. var pos = position
  142. if g.buf[pos] == '\'':
  143. inc(pos)
  144. case g.buf[pos]
  145. of 'f', 'F':
  146. g.kind = gtFloatNumber
  147. inc(pos)
  148. if g.buf[pos] in {'0'..'9'}: inc(pos)
  149. if g.buf[pos] in {'0'..'9'}: inc(pos)
  150. of 'i', 'I':
  151. inc(pos)
  152. if g.buf[pos] in {'0'..'9'}: inc(pos)
  153. if g.buf[pos] in {'0'..'9'}: inc(pos)
  154. else:
  155. discard
  156. result = pos
  157. proc nimNumber(g: var GeneralTokenizer, position: int): int =
  158. const decChars = {'0'..'9', '_'}
  159. var pos = position
  160. g.kind = gtDecNumber
  161. while g.buf[pos] in decChars: inc(pos)
  162. if g.buf[pos] == '.':
  163. g.kind = gtFloatNumber
  164. inc(pos)
  165. while g.buf[pos] in decChars: inc(pos)
  166. if g.buf[pos] in {'e', 'E'}:
  167. g.kind = gtFloatNumber
  168. inc(pos)
  169. if g.buf[pos] in {'+', '-'}: inc(pos)
  170. while g.buf[pos] in decChars: inc(pos)
  171. result = nimNumberPostfix(g, pos)
  172. const
  173. OpChars = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.',
  174. '|', '=', '%', '&', '$', '@', '~', ':'}
  175. proc isKeyword(x: openArray[string], y: string): int =
  176. binarySearch(x, y)
  177. proc nimNextToken(g: var GeneralTokenizer, keywords: openArray[string] = @[]) =
  178. const
  179. hexChars = {'0'..'9', 'A'..'F', 'a'..'f', '_'}
  180. octChars = {'0'..'7', '_'}
  181. binChars = {'0'..'1', '_'}
  182. SymChars = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'}
  183. var pos = g.pos
  184. g.start = g.pos
  185. if g.state == gtStringLit:
  186. if g.buf[pos] == '\\':
  187. g.kind = gtEscapeSequence
  188. inc(pos)
  189. case g.buf[pos]
  190. of 'x', 'X':
  191. inc(pos)
  192. if g.buf[pos] in hexChars: inc(pos)
  193. if g.buf[pos] in hexChars: inc(pos)
  194. of '0'..'9':
  195. while g.buf[pos] in {'0'..'9'}: inc(pos)
  196. of '\0':
  197. g.state = gtNone
  198. else: inc(pos)
  199. else:
  200. g.kind = gtStringLit
  201. while true:
  202. case g.buf[pos]
  203. of '\\':
  204. break
  205. of '\0', '\r', '\n':
  206. g.state = gtNone
  207. break
  208. of '\"':
  209. inc(pos)
  210. g.state = gtNone
  211. break
  212. else: inc(pos)
  213. else:
  214. case g.buf[pos]
  215. of ' ', '\t'..'\r':
  216. g.kind = gtWhitespace
  217. while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos)
  218. of '#':
  219. g.kind = gtComment
  220. inc(pos)
  221. var isDoc = false
  222. if g.buf[pos] == '#':
  223. inc(pos)
  224. isDoc = true
  225. if g.buf[pos] == '[' and g.lang == langNim:
  226. g.kind = gtLongComment
  227. var nesting = 0
  228. while true:
  229. case g.buf[pos]
  230. of '\0': break
  231. of '#':
  232. if isDoc:
  233. if g.buf[pos+1] == '#' and g.buf[pos+2] == '[':
  234. inc nesting
  235. elif g.buf[pos+1] == '[':
  236. inc nesting
  237. inc pos
  238. of ']':
  239. if isDoc:
  240. if g.buf[pos+1] == '#' and g.buf[pos+2] == '#':
  241. if nesting == 0:
  242. inc(pos, 3)
  243. break
  244. dec nesting
  245. elif g.buf[pos+1] == '#':
  246. if nesting == 0:
  247. inc(pos, 2)
  248. break
  249. dec nesting
  250. inc pos
  251. else:
  252. inc pos
  253. else:
  254. while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
  255. of 'a'..'z', 'A'..'Z', '_', '\x80'..'\xFF':
  256. var id = ""
  257. while g.buf[pos] in SymChars + {'_'}:
  258. add(id, g.buf[pos])
  259. inc(pos)
  260. if (g.buf[pos] == '\"'):
  261. if (g.buf[pos + 1] == '\"') and (g.buf[pos + 2] == '\"'):
  262. inc(pos, 3)
  263. g.kind = gtLongStringLit
  264. while true:
  265. case g.buf[pos]
  266. of '\0':
  267. break
  268. of '\"':
  269. inc(pos)
  270. if g.buf[pos] == '\"' and g.buf[pos+1] == '\"' and
  271. g.buf[pos+2] != '\"':
  272. inc(pos, 2)
  273. break
  274. else: inc(pos)
  275. else:
  276. g.kind = gtRawData
  277. inc(pos)
  278. while not (g.buf[pos] in {'\0', '\n', '\r'}):
  279. if g.buf[pos] == '"' and g.buf[pos+1] != '"': break
  280. inc(pos)
  281. if g.buf[pos] == '\"': inc(pos)
  282. else:
  283. if g.lang == langNim:
  284. g.kind = nimGetKeyword(id)
  285. elif isKeyword(keywords, id) >= 0:
  286. g.kind = gtKeyword
  287. of '0':
  288. inc(pos)
  289. case g.buf[pos]
  290. of 'b', 'B':
  291. g.kind = gtBinNumber
  292. inc(pos)
  293. while g.buf[pos] in binChars: inc(pos)
  294. pos = nimNumberPostfix(g, pos)
  295. of 'x', 'X':
  296. g.kind = gtHexNumber
  297. inc(pos)
  298. while g.buf[pos] in hexChars: inc(pos)
  299. pos = nimNumberPostfix(g, pos)
  300. of 'o', 'O':
  301. g.kind = gtOctNumber
  302. inc(pos)
  303. while g.buf[pos] in octChars: inc(pos)
  304. pos = nimNumberPostfix(g, pos)
  305. else: pos = nimNumber(g, pos)
  306. of '1'..'9':
  307. pos = nimNumber(g, pos)
  308. of '\'':
  309. inc(pos)
  310. g.kind = gtCharLit
  311. while true:
  312. case g.buf[pos]
  313. of '\0', '\r', '\n':
  314. break
  315. of '\'':
  316. inc(pos)
  317. break
  318. of '\\':
  319. inc(pos, 2)
  320. else: inc(pos)
  321. of '\"':
  322. inc(pos)
  323. if (g.buf[pos] == '\"') and (g.buf[pos + 1] == '\"'):
  324. inc(pos, 2)
  325. g.kind = gtLongStringLit
  326. while true:
  327. case g.buf[pos]
  328. of '\0':
  329. break
  330. of '\"':
  331. inc(pos)
  332. if g.buf[pos] == '\"' and g.buf[pos+1] == '\"' and
  333. g.buf[pos+2] != '\"':
  334. inc(pos, 2)
  335. break
  336. else: inc(pos)
  337. else:
  338. g.kind = gtStringLit
  339. while true:
  340. case g.buf[pos]
  341. of '\0', '\r', '\n':
  342. break
  343. of '\"':
  344. inc(pos)
  345. break
  346. of '\\':
  347. g.state = g.kind
  348. break
  349. else: inc(pos)
  350. of '(', ')', '[', ']', '{', '}', '`', ':', ',', ';':
  351. inc(pos)
  352. g.kind = gtPunctuation
  353. of '\0':
  354. g.kind = gtEof
  355. else:
  356. if g.buf[pos] in OpChars:
  357. g.kind = gtOperator
  358. while g.buf[pos] in OpChars: inc(pos)
  359. else:
  360. inc(pos)
  361. g.kind = gtNone
  362. g.length = pos - g.pos
  363. if g.kind != gtEof and g.state != gtNone and g.length <= 0:
  364. assert false, "nimNextToken: produced an empty token"
  365. g.pos = pos
  366. proc generalNumber(g: var GeneralTokenizer, position: int): int =
  367. const decChars = {'0'..'9'}
  368. var pos = position
  369. g.kind = gtDecNumber
  370. while g.buf[pos] in decChars: inc(pos)
  371. if g.buf[pos] == '.':
  372. g.kind = gtFloatNumber
  373. inc(pos)
  374. while g.buf[pos] in decChars: inc(pos)
  375. if g.buf[pos] in {'e', 'E'}:
  376. g.kind = gtFloatNumber
  377. inc(pos)
  378. if g.buf[pos] in {'+', '-'}: inc(pos)
  379. while g.buf[pos] in decChars: inc(pos)
  380. result = pos
  381. proc generalStrLit(g: var GeneralTokenizer, position: int): int =
  382. const
  383. decChars = {'0'..'9'}
  384. hexChars = {'0'..'9', 'A'..'F', 'a'..'f'}
  385. var pos = position
  386. g.kind = gtStringLit
  387. var c = g.buf[pos]
  388. inc(pos) # skip " or '
  389. while true:
  390. case g.buf[pos]
  391. of '\0':
  392. break
  393. of '\\':
  394. inc(pos)
  395. case g.buf[pos]
  396. of '\0':
  397. break
  398. of '0'..'9':
  399. while g.buf[pos] in decChars: inc(pos)
  400. of 'x', 'X':
  401. inc(pos)
  402. if g.buf[pos] in hexChars: inc(pos)
  403. if g.buf[pos] in hexChars: inc(pos)
  404. else: inc(pos, 2)
  405. else:
  406. if g.buf[pos] == c:
  407. inc(pos)
  408. break
  409. else:
  410. inc(pos)
  411. result = pos
  412. type
  413. TokenizerFlag = enum
  414. hasPreprocessor, hasNestedComments
  415. TokenizerFlags = set[TokenizerFlag]
  416. proc clikeNextToken(g: var GeneralTokenizer, keywords: openArray[string],
  417. flags: TokenizerFlags) =
  418. const
  419. hexChars = {'0'..'9', 'A'..'F', 'a'..'f'}
  420. octChars = {'0'..'7'}
  421. binChars = {'0'..'1'}
  422. symChars = {'A'..'Z', 'a'..'z', '0'..'9', '_', '\x80'..'\xFF'}
  423. var pos = g.pos
  424. g.start = g.pos
  425. if g.state == gtStringLit:
  426. g.kind = gtStringLit
  427. while true:
  428. case g.buf[pos]
  429. of '\\':
  430. g.kind = gtEscapeSequence
  431. inc(pos)
  432. case g.buf[pos]
  433. of 'x', 'X':
  434. inc(pos)
  435. if g.buf[pos] in hexChars: inc(pos)
  436. if g.buf[pos] in hexChars: inc(pos)
  437. of '0'..'9':
  438. while g.buf[pos] in {'0'..'9'}: inc(pos)
  439. of '\0':
  440. g.state = gtNone
  441. else: inc(pos)
  442. break
  443. of '\0', '\r', '\n':
  444. g.state = gtNone
  445. break
  446. of '\"':
  447. inc(pos)
  448. g.state = gtNone
  449. break
  450. else: inc(pos)
  451. else:
  452. case g.buf[pos]
  453. of ' ', '\t'..'\r':
  454. g.kind = gtWhitespace
  455. while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos)
  456. of '/':
  457. inc(pos)
  458. if g.buf[pos] == '/':
  459. g.kind = gtComment
  460. while not (g.buf[pos] in {'\0', '\n', '\r'}): inc(pos)
  461. elif g.buf[pos] == '*':
  462. g.kind = gtLongComment
  463. var nested = 0
  464. inc(pos)
  465. while true:
  466. case g.buf[pos]
  467. of '*':
  468. inc(pos)
  469. if g.buf[pos] == '/':
  470. inc(pos)
  471. if nested == 0: break
  472. of '/':
  473. inc(pos)
  474. if g.buf[pos] == '*':
  475. inc(pos)
  476. if hasNestedComments in flags: inc(nested)
  477. of '\0':
  478. break
  479. else: inc(pos)
  480. else:
  481. g.kind = gtOperator
  482. while g.buf[pos] in OpChars: inc(pos)
  483. of '#':
  484. inc(pos)
  485. if hasPreprocessor in flags:
  486. g.kind = gtPreprocessor
  487. while g.buf[pos] in {' ', '\t'}: inc(pos)
  488. while g.buf[pos] in symChars: inc(pos)
  489. else:
  490. g.kind = gtOperator
  491. of 'a'..'z', 'A'..'Z', '_', '\x80'..'\xFF':
  492. var id = ""
  493. while g.buf[pos] in symChars:
  494. add(id, g.buf[pos])
  495. inc(pos)
  496. if isKeyword(keywords, id) >= 0: g.kind = gtKeyword
  497. else: g.kind = gtIdentifier
  498. of '0':
  499. inc(pos)
  500. case g.buf[pos]
  501. of 'b', 'B':
  502. inc(pos)
  503. while g.buf[pos] in binChars: inc(pos)
  504. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  505. of 'x', 'X':
  506. inc(pos)
  507. while g.buf[pos] in hexChars: inc(pos)
  508. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  509. of '0'..'7':
  510. inc(pos)
  511. while g.buf[pos] in octChars: inc(pos)
  512. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  513. else:
  514. pos = generalNumber(g, pos)
  515. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  516. of '1'..'9':
  517. pos = generalNumber(g, pos)
  518. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  519. of '\'':
  520. pos = generalStrLit(g, pos)
  521. g.kind = gtCharLit
  522. of '\"':
  523. inc(pos)
  524. g.kind = gtStringLit
  525. while true:
  526. case g.buf[pos]
  527. of '\0':
  528. break
  529. of '\"':
  530. inc(pos)
  531. break
  532. of '\\':
  533. g.state = g.kind
  534. break
  535. else: inc(pos)
  536. of '(', ')', '[', ']', '{', '}', ':', ',', ';', '.':
  537. inc(pos)
  538. g.kind = gtPunctuation
  539. of '\0':
  540. g.kind = gtEof
  541. else:
  542. if g.buf[pos] in OpChars:
  543. g.kind = gtOperator
  544. while g.buf[pos] in OpChars: inc(pos)
  545. else:
  546. inc(pos)
  547. g.kind = gtNone
  548. g.length = pos - g.pos
  549. if g.kind != gtEof and g.length <= 0:
  550. assert false, "clikeNextToken: produced an empty token"
  551. g.pos = pos
  552. proc cNextToken(g: var GeneralTokenizer) =
  553. const
  554. keywords: array[0..36, string] = ["_Bool", "_Complex", "_Imaginary", "auto",
  555. "break", "case", "char", "const", "continue", "default", "do", "double",
  556. "else", "enum", "extern", "float", "for", "goto", "if", "inline", "int",
  557. "long", "register", "restrict", "return", "short", "signed", "sizeof",
  558. "static", "struct", "switch", "typedef", "union", "unsigned", "void",
  559. "volatile", "while"]
  560. clikeNextToken(g, keywords, {hasPreprocessor})
  561. proc cppNextToken(g: var GeneralTokenizer) =
  562. const
  563. keywords: array[0..47, string] = ["asm", "auto", "break", "case", "catch",
  564. "char", "class", "const", "continue", "default", "delete", "do", "double",
  565. "else", "enum", "extern", "float", "for", "friend", "goto", "if",
  566. "inline", "int", "long", "new", "operator", "private", "protected",
  567. "public", "register", "return", "short", "signed", "sizeof", "static",
  568. "struct", "switch", "template", "this", "throw", "try", "typedef",
  569. "union", "unsigned", "virtual", "void", "volatile", "while"]
  570. clikeNextToken(g, keywords, {hasPreprocessor})
  571. proc csharpNextToken(g: var GeneralTokenizer) =
  572. const
  573. keywords: array[0..76, string] = ["abstract", "as", "base", "bool", "break",
  574. "byte", "case", "catch", "char", "checked", "class", "const", "continue",
  575. "decimal", "default", "delegate", "do", "double", "else", "enum", "event",
  576. "explicit", "extern", "false", "finally", "fixed", "float", "for",
  577. "foreach", "goto", "if", "implicit", "in", "int", "interface", "internal",
  578. "is", "lock", "long", "namespace", "new", "null", "object", "operator",
  579. "out", "override", "params", "private", "protected", "public", "readonly",
  580. "ref", "return", "sbyte", "sealed", "short", "sizeof", "stackalloc",
  581. "static", "string", "struct", "switch", "this", "throw", "true", "try",
  582. "typeof", "uint", "ulong", "unchecked", "unsafe", "ushort", "using",
  583. "virtual", "void", "volatile", "while"]
  584. clikeNextToken(g, keywords, {hasPreprocessor})
  585. proc javaNextToken(g: var GeneralTokenizer) =
  586. const
  587. keywords: array[0..52, string] = ["abstract", "assert", "boolean", "break",
  588. "byte", "case", "catch", "char", "class", "const", "continue", "default",
  589. "do", "double", "else", "enum", "extends", "false", "final", "finally",
  590. "float", "for", "goto", "if", "implements", "import", "instanceof", "int",
  591. "interface", "long", "native", "new", "null", "package", "private",
  592. "protected", "public", "return", "short", "static", "strictfp", "super",
  593. "switch", "synchronized", "this", "throw", "throws", "transient", "true",
  594. "try", "void", "volatile", "while"]
  595. clikeNextToken(g, keywords, {})
  596. proc yamlPlainStrLit(g: var GeneralTokenizer, pos: var int) =
  597. g.kind = gtStringLit
  598. while g.buf[pos] notin {'\0', '\t'..'\r', ',', ']', '}'}:
  599. if g.buf[pos] == ':' and
  600. g.buf[pos + 1] in {'\0', '\t'..'\r', ' '}:
  601. break
  602. inc(pos)
  603. proc yamlPossibleNumber(g: var GeneralTokenizer, pos: var int) =
  604. g.kind = gtNone
  605. if g.buf[pos] == '-': inc(pos)
  606. if g.buf[pos] == '0': inc(pos)
  607. elif g.buf[pos] in '1'..'9':
  608. inc(pos)
  609. while g.buf[pos] in {'0'..'9'}: inc(pos)
  610. else: yamlPlainStrLit(g, pos)
  611. if g.kind == gtNone:
  612. if g.buf[pos] in {'\0', '\t'..'\r', ' ', ',', ']', '}'}:
  613. g.kind = gtDecNumber
  614. elif g.buf[pos] == '.':
  615. inc(pos)
  616. if g.buf[pos] notin {'0'..'9'}: yamlPlainStrLit(g, pos)
  617. else:
  618. while g.buf[pos] in {'0'..'9'}: inc(pos)
  619. if g.buf[pos] in {'\0', '\t'..'\r', ' ', ',', ']', '}'}:
  620. g.kind = gtFloatNumber
  621. if g.kind == gtNone:
  622. if g.buf[pos] in {'e', 'E'}:
  623. inc(pos)
  624. if g.buf[pos] in {'-', '+'}: inc(pos)
  625. if g.buf[pos] notin {'0'..'9'}: yamlPlainStrLit(g, pos)
  626. else:
  627. while g.buf[pos] in {'0'..'9'}: inc(pos)
  628. if g.buf[pos] in {'\0', '\t'..'\r', ' ', ',', ']', '}'}:
  629. g.kind = gtFloatNumber
  630. else: yamlPlainStrLit(g, pos)
  631. else: yamlPlainStrLit(g, pos)
  632. while g.buf[pos] notin {'\0', ',', ']', '}', '\n', '\r'}:
  633. inc(pos)
  634. if g.buf[pos] notin {'\t'..'\r', ' ', ',', ']', '}'}:
  635. yamlPlainStrLit(g, pos)
  636. break
  637. # theoretically, we would need to parse indentation (like with block scalars)
  638. # because of possible multiline flow scalars that start with number-like
  639. # content, but that is far too troublesome. I think it is fine that the
  640. # highlighter is sloppy here.
  641. proc yamlNextToken(g: var GeneralTokenizer) =
  642. const
  643. hexChars = {'0'..'9', 'A'..'F', 'a'..'f'}
  644. var pos = g.pos
  645. g.start = g.pos
  646. if g.state == gtStringLit:
  647. g.kind = gtStringLit
  648. while true:
  649. case g.buf[pos]
  650. of '\\':
  651. if pos != g.pos: break
  652. g.kind = gtEscapeSequence
  653. inc(pos)
  654. case g.buf[pos]
  655. of 'x':
  656. inc(pos)
  657. for i in 1..2:
  658. if g.buf[pos] in hexChars: inc(pos)
  659. break
  660. of 'u':
  661. inc(pos)
  662. for i in 1..4:
  663. if g.buf[pos] in hexChars: inc(pos)
  664. break
  665. of 'U':
  666. inc(pos)
  667. for i in 1..8:
  668. if g.buf[pos] in hexChars: inc(pos)
  669. break
  670. else: inc(pos)
  671. break
  672. of '\0':
  673. g.state = gtOther
  674. break
  675. of '\"':
  676. inc(pos)
  677. g.state = gtOther
  678. break
  679. else: inc(pos)
  680. elif g.state == gtCharLit:
  681. # abusing gtCharLit as single-quoted string lit
  682. g.kind = gtStringLit
  683. inc(pos) # skip the starting '
  684. while true:
  685. case g.buf[pos]
  686. of '\'':
  687. inc(pos)
  688. if g.buf[pos] == '\'':
  689. inc(pos)
  690. g.kind = gtEscapeSequence
  691. else: g.state = gtOther
  692. break
  693. else: inc(pos)
  694. elif g.state == gtCommand:
  695. # gtCommand means 'block scalar header'
  696. case g.buf[pos]
  697. of ' ', '\t':
  698. g.kind = gtWhitespace
  699. while g.buf[pos] in {' ', '\t'}: inc(pos)
  700. of '#':
  701. g.kind = gtComment
  702. while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
  703. of '\n', '\r': discard
  704. else:
  705. # illegal here. just don't parse a block scalar
  706. g.kind = gtNone
  707. g.state = gtOther
  708. if g.buf[pos] in {'\n', '\r'} and g.state == gtCommand:
  709. g.state = gtLongStringLit
  710. elif g.state == gtLongStringLit:
  711. # beware, this is the only token where we actually have to parse
  712. # indentation.
  713. g.kind = gtLongStringLit
  714. # first, we have to find the parent indentation of the block scalar, so that
  715. # we know when to stop
  716. assert g.buf[pos] in {'\n', '\r'}
  717. var lookbehind = pos - 1
  718. var headerStart = -1
  719. while lookbehind >= 0 and g.buf[lookbehind] notin {'\n', '\r'}:
  720. if headerStart == -1 and g.buf[lookbehind] in {'|', '>'}:
  721. headerStart = lookbehind
  722. dec(lookbehind)
  723. assert headerStart != -1
  724. var indentation = 1
  725. while g.buf[lookbehind + indentation] == ' ': inc(indentation)
  726. if g.buf[lookbehind + indentation] in {'|', '>'}:
  727. # when the header is alone in a line, this line does not show the parent's
  728. # indentation, so we must go further. search the first previous line with
  729. # non-whitespace content.
  730. while lookbehind >= 0 and g.buf[lookbehind] in {'\n', '\r'}:
  731. dec(lookbehind)
  732. while lookbehind >= 0 and
  733. g.buf[lookbehind] in {' ', '\t'}: dec(lookbehind)
  734. # now, find the beginning of the line...
  735. while lookbehind >= 0 and g.buf[lookbehind] notin {'\n', '\r'}:
  736. dec(lookbehind)
  737. # ... and its indentation
  738. indentation = 1
  739. while g.buf[lookbehind + indentation] == ' ': inc(indentation)
  740. if lookbehind == -1: indentation = 0 # top level
  741. elif g.buf[lookbehind + 1] == '-' and g.buf[lookbehind + 2] == '-' and
  742. g.buf[lookbehind + 3] == '-' and
  743. g.buf[lookbehind + 4] in {'\t'..'\r', ' '}:
  744. # this is a document start, therefore, we are at top level
  745. indentation = 0
  746. # because lookbehind was at newline char when calculating indentation, we're
  747. # off by one. fix that. top level's parent will have indentation of -1.
  748. let parentIndentation = indentation - 1
  749. # find first content
  750. while g.buf[pos] in {' ', '\n', '\r'}:
  751. if g.buf[pos] == ' ': inc(indentation)
  752. else: indentation = 0
  753. inc(pos)
  754. var minIndentation = indentation
  755. # for stupid edge cases, we must check whether an explicit indentation depth
  756. # is given at the header.
  757. while g.buf[headerStart] in {'>', '|', '+', '-'}: inc(headerStart)
  758. if g.buf[headerStart] in {'0'..'9'}:
  759. minIndentation = min(minIndentation, ord(g.buf[headerStart]) - ord('0'))
  760. # process content lines
  761. while indentation > parentIndentation and g.buf[pos] != '\0':
  762. if (indentation < minIndentation and g.buf[pos] == '#') or
  763. (indentation == 0 and g.buf[pos] == '.' and g.buf[pos + 1] == '.' and
  764. g.buf[pos + 2] == '.' and
  765. g.buf[pos + 3] in {'\0', '\t'..'\r', ' '}):
  766. # comment after end of block scalar, or end of document
  767. break
  768. minIndentation = min(indentation, minIndentation)
  769. while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
  770. while g.buf[pos] in {' ', '\n', '\r'}:
  771. if g.buf[pos] == ' ': inc(indentation)
  772. else: indentation = 0
  773. inc(pos)
  774. g.state = gtOther
  775. elif g.state == gtOther:
  776. # gtOther means 'inside YAML document'
  777. case g.buf[pos]
  778. of ' ', '\t'..'\r':
  779. g.kind = gtWhitespace
  780. while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos)
  781. of '#':
  782. g.kind = gtComment
  783. inc(pos)
  784. while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
  785. of '-':
  786. inc(pos)
  787. if g.buf[pos] in {'\0', ' ', '\t'..'\r'}:
  788. g.kind = gtPunctuation
  789. elif g.buf[pos] == '-' and
  790. (pos == 1 or g.buf[pos - 2] in {'\n', '\r'}): # start of line
  791. inc(pos)
  792. if g.buf[pos] == '-' and g.buf[pos + 1] in {'\0', '\t'..'\r', ' '}:
  793. inc(pos)
  794. g.kind = gtKeyword
  795. else: yamlPossibleNumber(g, pos)
  796. else: yamlPossibleNumber(g, pos)
  797. of '.':
  798. if pos == 0 or g.buf[pos - 1] in {'\n', '\r'}:
  799. inc(pos)
  800. for i in 1..2:
  801. if g.buf[pos] != '.': break
  802. inc(pos)
  803. if pos == g.start + 3:
  804. g.kind = gtKeyword
  805. g.state = gtNone
  806. else: yamlPlainStrLit(g, pos)
  807. else: yamlPlainStrLit(g, pos)
  808. of '?':
  809. inc(pos)
  810. if g.buf[pos] in {'\0', ' ', '\t'..'\r'}:
  811. g.kind = gtPunctuation
  812. else: yamlPlainStrLit(g, pos)
  813. of ':':
  814. inc(pos)
  815. if g.buf[pos] in {'\0', '\t'..'\r', ' ', '\'', '\"'} or
  816. (pos > 0 and g.buf[pos - 2] in {'}', ']', '\"', '\''}):
  817. g.kind = gtPunctuation
  818. else: yamlPlainStrLit(g, pos)
  819. of '[', ']', '{', '}', ',':
  820. inc(pos)
  821. g.kind = gtPunctuation
  822. of '\"':
  823. inc(pos)
  824. g.state = gtStringLit
  825. g.kind = gtStringLit
  826. of '\'':
  827. g.state = gtCharLit
  828. g.kind = gtNone
  829. of '!':
  830. g.kind = gtTagStart
  831. inc(pos)
  832. if g.buf[pos] == '<':
  833. # literal tag (e.g. `!<tag:yaml.org,2002:str>`)
  834. while g.buf[pos] notin {'\0', '>', '\t'..'\r', ' '}: inc(pos)
  835. if g.buf[pos] == '>': inc(pos)
  836. else:
  837. while g.buf[pos] in {'A'..'Z', 'a'..'z', '0'..'9', '-'}: inc(pos)
  838. case g.buf[pos]
  839. of '!':
  840. # prefixed tag (e.g. `!!str`)
  841. inc(pos)
  842. while g.buf[pos] notin
  843. {'\0', '\t'..'\r', ' ', ',', '[', ']', '{', '}'}: inc(pos)
  844. of '\0', '\t'..'\r', ' ': discard
  845. else:
  846. # local tag (e.g. `!nim:system:int`)
  847. while g.buf[pos] notin {'\0', '\t'..'\r', ' '}: inc(pos)
  848. of '&':
  849. g.kind = gtLabel
  850. while g.buf[pos] notin {'\0', '\t'..'\r', ' '}: inc(pos)
  851. of '*':
  852. g.kind = gtReference
  853. while g.buf[pos] notin {'\0', '\t'..'\r', ' '}: inc(pos)
  854. of '|', '>':
  855. # this can lead to incorrect tokenization when | or > appear inside flow
  856. # content. checking whether we're inside flow content is not
  857. # chomsky type-3, so we won't do that here.
  858. g.kind = gtCommand
  859. g.state = gtCommand
  860. inc(pos)
  861. while g.buf[pos] in {'0'..'9', '+', '-'}: inc(pos)
  862. of '0'..'9': yamlPossibleNumber(g, pos)
  863. of '\0': g.kind = gtEof
  864. else: yamlPlainStrLit(g, pos)
  865. else:
  866. # outside document
  867. case g.buf[pos]
  868. of '%':
  869. if pos == 0 or g.buf[pos - 1] in {'\n', '\r'}:
  870. g.kind = gtDirective
  871. while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
  872. else:
  873. g.state = gtOther
  874. yamlPlainStrLit(g, pos)
  875. of ' ', '\t'..'\r':
  876. g.kind = gtWhitespace
  877. while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos)
  878. of '#':
  879. g.kind = gtComment
  880. while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
  881. of '\0': g.kind = gtEof
  882. else:
  883. g.kind = gtNone
  884. g.state = gtOther
  885. g.length = pos - g.pos
  886. g.pos = pos
  proc pythonNextToken(g: var GeneralTokenizer) =
    ## Scans the next token of Python source by handing `g` to
    ## `nimNextToken` together with Python's reserved words as the keyword
    ## list.
    # The list is kept in ascending ASCII order (capitalised words first).
    # NOTE(review): presumably `nimNextToken` depends on that ordering for
    # its keyword lookup - confirm before reordering or appending.
    const pythonKeywords = [
      "False", "None", "True",
      "and", "as", "assert", "async", "await",
      "break", "class", "continue",
      "def", "del",
      "elif", "else", "except",
      "finally", "for", "from",
      "global",
      "if", "import", "in", "is",
      "lambda",
      "nonlocal", "not",
      "or",
      "pass",
      "raise", "return",
      "try",
      "while", "with",
      "yield"]
    nimNextToken(g, pythonKeywords)
  proc cmdNextToken(g: var GeneralTokenizer, dollarPrompt = false) =
    ## Scans one token of a shell command line beginning at `g.pos`.
    ##
    ## On return `g.kind` holds the token class, `g.start` the position the
    ## token began at, `g.length` its length, and `g.pos` points just past it.
    ##
    ## `g.state` records what the next plain word will be classified as:
    ## * `gtPrompt` (only entered when `dollarPrompt` is true): a `$`
    ##   followed by a space or tab is a prompt; any other plain text is
    ##   consumed up to the end of the line as `gtProgramOutput`.
    ## * `gtProgram`: the next word is the program name.
    ## * any other state: the next word is an option/argument.
    const
      blanks = {' ', '\t'..'\r'}
      lineEnd = {'\n', '\0'}
      wordEnd = {' ', '\t'..'\r', '&', '|', '(', ')', '\'', '"', '\0'}
    # State every new line starts in.
    let lineStartState = if dollarPrompt: gtPrompt else: gtProgram

    var cur = g.pos
    g.start = cur
    if g.state == low(TokenClass):
      # First call on this tokenizer: behave as at the start of a line.
      g.state = lineStartState

    let first = g.buf[cur]
    case first
    of ' ', '\t'..'\r':
      g.kind = gtWhitespace
      while g.buf[cur] in blanks:
        if g.buf[cur] == '\n':
          # A newline inside the run resets the state for the next line.
          g.state = lineStartState
        inc(cur)
    of '\'', '"':
      # Quoted argument: runs to the matching quote, or to the end of the
      # buffer when the quote is never closed.
      g.kind = gtOption
      inc(cur)
      while g.buf[cur] != first and g.buf[cur] != '\0':
        inc(cur)
      if g.buf[cur] == first: inc(cur)
    of '#':
      g.kind = gtComment
      while g.buf[cur] notin lineEnd:
        inc(cur)
    of '&', '|':
      g.kind = gtOperator
      inc(cur)
      # A doubled operator character (`&&`, `||`) forms a single token.
      if g.buf[cur] == first: inc(cur)
      g.state = gtProgram
    of '(', ';':
      # Both start a new command, so a program name is expected next.
      g.kind = gtOperator
      g.state = gtProgram
      inc(cur)
    of ')':
      g.kind = gtOperator
      inc(cur)
    of '\0':
      g.kind = gtEof
    elif dollarPrompt and g.state == gtPrompt:
      if first == '$' and g.buf[cur + 1] in {' ', '\t'}:
        # The prompt token covers the `$` and the blank right after it.
        g.kind = gtPrompt
        g.state = gtProgram
        inc(cur, 2)
      else:
        g.kind = gtProgramOutput
        while g.buf[cur] notin lineEnd:
          inc(cur)
    else:
      # Plain word: program name if one is expected, otherwise an option.
      if g.state == gtProgram:
        g.kind = gtProgram
        g.state = gtOption
      else:
        g.kind = gtOption
      while g.buf[cur] notin wordEnd:
        let c = g.buf[cur]
        if c == ';' and g.buf[cur + 1] == ' ':
          # (check space because ';' can be used inside arguments in Win bat)
          break
        if g.kind == gtOption and c in {'/', '\\', '.'}:
          g.kind = gtIdentifier # for file/dir name
        elif g.kind == gtProgram and c == '=':
          g.kind = gtIdentifier # for env variable setting at beginning of line
          g.state = gtProgram   # so the following word is the program again
        inc(cur)

    g.length = cur - g.pos
    g.pos = cur
  proc getNextToken*(g: var GeneralTokenizer, lang: SourceLanguage) =
    ## Stores `lang` in `g.lang` and runs the tokenizer registered for that
    ## language on `g`.
    ##
    ## `langNone` has no tokenizer; passing it is a programming error and
    ## fails an assertion.
    g.lang = lang
    case lang
    of langNone:
      # `doAssert` rather than `assert`: `assert` is compiled out when
      # assertions are disabled, which would turn this into a silent no-op
      # that leaves `g` untouched.
      doAssert false, "getNextToken: langNone has no tokenizer"
    of langNim: nimNextToken(g)
    of langCpp: cppNextToken(g)
    of langCsharp: csharpNextToken(g)
    of langC: cNextToken(g)
    of langJava: javaNextToken(g)
    of langYaml: yamlNextToken(g)
    of langPython: pythonNextToken(g)
    of langCmd: cmdNextToken(g)
    of langConsole: cmdNextToken(g, dollarPrompt=true)
  proc tokenize*(text: string, lang: SourceLanguage): seq[(string, TokenClass)] =
    ## Runs `getNextToken` for `lang` over `text` and returns one
    ## `(substring, class)` pair per token, in order.
    ##
    ## Each substring spans from the end of the previous token (0 for the
    ## first one) up to the tokenizer position after the current token.
    ## Scanning stops at the first `gtEof` token, which is not included.
    var tokenizer: GeneralTokenizer
    initGeneralTokenizer(tokenizer, text)
    var tokenBegin = 0
    getNextToken(tokenizer, lang)
    while tokenizer.kind != gtEof:
      result.add((text[tokenBegin ..< tokenizer.pos], tokenizer.kind))
      tokenBegin = tokenizer.pos
      getNextToken(tokenizer, lang)
  when isMainModule:
    # Self-check, run only when this module is the main program: the
    # whitespace-separated words of doc/keywords.txt must equal
    # `nimKeywords`, entry by entry and in length.
    var keywords: seq[string]
    # Try to work running in both the subdir or at the root.
    for filename in ["doc/keywords.txt", "../../../doc/keywords.txt"]:
      try:
        keywords = readFile(filename).splitWhitespace()
        break
      except IOError:
        # Only a failed read means "try the next candidate path". A bare
        # `except:` would also swallow defects and unrelated errors.
        echo filename, " not found"
    doAssert(keywords.len > 0, "Couldn't read any keywords.txt file!")
    # Compare the common prefix first so a mismatching entry is reported
    # before any difference in length.
    for i in 0 ..< min(keywords.len, nimKeywords.len):
      doAssert keywords[i] == nimKeywords[i], "Unexpected keyword"
    doAssert keywords.len == nimKeywords.len, "No matching lengths"