highlite.nim 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2012 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## Source highlighter for programming or markup languages.
  10. ## Currently only a few languages are supported; other languages may be added.
  11. ## The interface supports one language nested in another.
  12. ##
  13. ## You can use this to build your own syntax highlighting, check this example:
  14. ##
  15. ## .. code:: Nim
  16. ## let code = """for x in $int.high: echo x.ord mod 2 == 0"""
  17. ## var toknizr: GeneralTokenizer
  18. ## initGeneralTokenizer(toknizr, code)
  19. ## while true:
  20. ## getNextToken(toknizr, langNim)
  21. ## case toknizr.kind
  22. ## of gtEof: break # End Of File (or string)
  23. ## of gtWhitespace:
  24. ## echo gtWhitespace # Maybe you want "visible" whitespaces?.
  25. ## echo substr(code, toknizr.start, toknizr.length + toknizr.start - 1)
  26. ## of gtOperator:
  27. ## echo gtOperator # Maybe you want Operators to use a specific color?.
  28. ## echo substr(code, toknizr.start, toknizr.length + toknizr.start - 1)
  29. ## # of gtSomeSymbol: syntaxHighlight("Comic Sans", "bold", "99px", "pink")
  30. ## else:
  31. ## echo toknizr.kind # All the kinds of tokens can be processed here.
  32. ## echo substr(code, toknizr.start, toknizr.length + toknizr.start - 1)
  33. ##
  34. ## The proc `getSourceLanguage` can get the language `enum` from a string:
  35. ##
  36. ## .. code:: Nim
  37. ## for l in ["C", "c++", "jAvA", "Nim", "c#"]: echo getSourceLanguage(l)
  38. ##
  39. ## There is also a `Cmd` pseudo-language supported, which is a simple generic
  40. ## shell/cmdline tokenizer (UNIX shell/Powershell/Windows Command):
  41. ## no escaping, no programming language constructs besides variable definition
  42. ## at the beginning of line. It supports these operators:
  43. ##
  44. ## .. code:: Cmd
  45. ## & && | || ( ) '' "" ; # for comments
  46. ##
  47. ## Instead of escaping always use quotes like here
  48. ## `nimgrep --ext:'nim|nims' file.name`:cmd: shows how to input ``|``.
  49. ## Any argument that contains ``.`` or ``/`` or ``\`` will be treated
  50. ## as a file or directory.
  51. ##
  52. ## In addition to `Cmd` there is also `Console` language for
  53. ## displaying interactive sessions.
  54. ## Lines with a command should start with ``$``, other lines are considered
  55. ## as program output.
  56. import
  57. strutils
  58. from algorithm import binarySearch
  59. when defined(nimPreviewSlimSystem):
  60. import std/assertions
  61. type
  62. SourceLanguage* = enum
  63. langNone, langNim, langCpp, langCsharp, langC, langJava,
  64. langYaml, langPython, langCmd, langConsole
  65. TokenClass* = enum
  66. gtEof, gtNone, gtWhitespace, gtDecNumber, gtBinNumber, gtHexNumber,
  67. gtOctNumber, gtFloatNumber, gtIdentifier, gtKeyword, gtStringLit,
  68. gtLongStringLit, gtCharLit, gtEscapeSequence, # escape sequence like \xff
  69. gtOperator, gtPunctuation, gtComment, gtLongComment, gtRegularExpression,
  70. gtTagStart, gtTagEnd, gtKey, gtValue, gtRawData, gtAssembler,
  71. gtPreprocessor, gtDirective, gtCommand, gtRule, gtHyperlink, gtLabel,
  72. gtReference, gtPrompt, gtProgramOutput, gtProgram, gtOption, gtOther
  73. GeneralTokenizer* = object of RootObj
  74. kind*: TokenClass
  75. start*, length*: int
  76. buf: cstring
  77. pos: int
  78. state: TokenClass
  79. lang: SourceLanguage
  80. const
  81. sourceLanguageToStr*: array[SourceLanguage, string] = ["none",
  82. "Nim", "C++", "C#", "C", "Java", "Yaml", "Python", "Cmd", "Console"]
  83. sourceLanguageToAlpha*: array[SourceLanguage, string] = ["none",
  84. "Nim", "cpp", "csharp", "C", "Java", "Yaml", "Python", "Cmd", "Console"]
  85. ## list of languages spelled with alpabetic characters
  86. tokenClassToStr*: array[TokenClass, string] = ["Eof", "None", "Whitespace",
  87. "DecNumber", "BinNumber", "HexNumber", "OctNumber", "FloatNumber",
  88. "Identifier", "Keyword", "StringLit", "LongStringLit", "CharLit",
  89. "EscapeSequence", "Operator", "Punctuation", "Comment", "LongComment",
  90. "RegularExpression", "TagStart", "TagEnd", "Key", "Value", "RawData",
  91. "Assembler", "Preprocessor", "Directive", "Command", "Rule", "Hyperlink",
  92. "Label", "Reference", "Prompt", "ProgramOutput",
  93. # start from lower-case if there is a corresponding RST role (see rst.nim)
  94. "program", "option",
  95. "Other"]
  96. # The following list comes from doc/keywords.txt, make sure it is
  97. # synchronized with this array by running the module itself as a test case.
  98. nimKeywords = ["addr", "and", "as", "asm", "bind", "block",
  99. "break", "case", "cast", "concept", "const", "continue", "converter",
  100. "defer", "discard", "distinct", "div", "do",
  101. "elif", "else", "end", "enum", "except", "export",
  102. "finally", "for", "from", "func",
  103. "if", "import", "in", "include",
  104. "interface", "is", "isnot", "iterator", "let", "macro", "method",
  105. "mixin", "mod", "nil", "not", "notin", "object", "of", "or", "out", "proc",
  106. "ptr", "raise", "ref", "return", "shl", "shr", "static",
  107. "template", "try", "tuple", "type", "using", "var", "when", "while",
  108. "xor", "yield"]
  109. proc getSourceLanguage*(name: string): SourceLanguage =
  110. for i in succ(low(SourceLanguage)) .. high(SourceLanguage):
  111. if cmpIgnoreStyle(name, sourceLanguageToStr[i]) == 0:
  112. return i
  113. if cmpIgnoreStyle(name, sourceLanguageToAlpha[i]) == 0:
  114. return i
  115. result = langNone
  116. proc initGeneralTokenizer*(g: var GeneralTokenizer, buf: cstring) =
  117. g.buf = buf
  118. g.kind = low(TokenClass)
  119. g.start = 0
  120. g.length = 0
  121. g.state = low(TokenClass)
  122. g.lang = low(SourceLanguage)
  123. g.pos = 0
  124. proc initGeneralTokenizer*(g: var GeneralTokenizer, buf: string) =
  125. initGeneralTokenizer(g, cstring(buf))
  126. proc deinitGeneralTokenizer*(g: var GeneralTokenizer) =
  127. discard
  128. proc nimGetKeyword(id: string): TokenClass =
  129. for k in nimKeywords:
  130. if cmpIgnoreStyle(id, k) == 0: return gtKeyword
  131. result = gtIdentifier
  132. when false:
  133. var i = getIdent(id)
  134. if (i.id >= ord(tokKeywordLow) - ord(tkSymbol)) and
  135. (i.id <= ord(tokKeywordHigh) - ord(tkSymbol)):
  136. result = gtKeyword
  137. else:
  138. result = gtIdentifier
  139. proc nimNumberPostfix(g: var GeneralTokenizer, position: int): int =
  140. var pos = position
  141. if g.buf[pos] == '\'':
  142. inc(pos)
  143. case g.buf[pos]
  144. of 'f', 'F':
  145. g.kind = gtFloatNumber
  146. inc(pos)
  147. if g.buf[pos] in {'0'..'9'}: inc(pos)
  148. if g.buf[pos] in {'0'..'9'}: inc(pos)
  149. of 'i', 'I':
  150. inc(pos)
  151. if g.buf[pos] in {'0'..'9'}: inc(pos)
  152. if g.buf[pos] in {'0'..'9'}: inc(pos)
  153. else:
  154. discard
  155. result = pos
  156. proc nimNumber(g: var GeneralTokenizer, position: int): int =
  157. const decChars = {'0'..'9', '_'}
  158. var pos = position
  159. g.kind = gtDecNumber
  160. while g.buf[pos] in decChars: inc(pos)
  161. if g.buf[pos] == '.':
  162. g.kind = gtFloatNumber
  163. inc(pos)
  164. while g.buf[pos] in decChars: inc(pos)
  165. if g.buf[pos] in {'e', 'E'}:
  166. g.kind = gtFloatNumber
  167. inc(pos)
  168. if g.buf[pos] in {'+', '-'}: inc(pos)
  169. while g.buf[pos] in decChars: inc(pos)
  170. result = nimNumberPostfix(g, pos)
  171. const
  172. OpChars = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.',
  173. '|', '=', '%', '&', '$', '@', '~', ':'}
  174. proc isKeyword(x: openArray[string], y: string): int =
  175. binarySearch(x, y)
  176. proc nimNextToken(g: var GeneralTokenizer, keywords: openArray[string] = @[]) =
  177. const
  178. hexChars = {'0'..'9', 'A'..'F', 'a'..'f', '_'}
  179. octChars = {'0'..'7', '_'}
  180. binChars = {'0'..'1', '_'}
  181. SymChars = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'}
  182. var pos = g.pos
  183. g.start = g.pos
  184. if g.state == gtStringLit:
  185. if g.buf[pos] == '\\':
  186. g.kind = gtEscapeSequence
  187. inc(pos)
  188. case g.buf[pos]
  189. of 'x', 'X':
  190. inc(pos)
  191. if g.buf[pos] in hexChars: inc(pos)
  192. if g.buf[pos] in hexChars: inc(pos)
  193. of '0'..'9':
  194. while g.buf[pos] in {'0'..'9'}: inc(pos)
  195. of '\0':
  196. g.state = gtNone
  197. else: inc(pos)
  198. else:
  199. g.kind = gtStringLit
  200. while true:
  201. case g.buf[pos]
  202. of '\\':
  203. break
  204. of '\0', '\r', '\n':
  205. g.state = gtNone
  206. break
  207. of '\"':
  208. inc(pos)
  209. g.state = gtNone
  210. break
  211. else: inc(pos)
  212. else:
  213. case g.buf[pos]
  214. of ' ', '\t'..'\r':
  215. g.kind = gtWhitespace
  216. while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos)
  217. of '#':
  218. g.kind = gtComment
  219. inc(pos)
  220. var isDoc = false
  221. if g.buf[pos] == '#':
  222. inc(pos)
  223. isDoc = true
  224. if g.buf[pos] == '[' and g.lang == langNim:
  225. g.kind = gtLongComment
  226. var nesting = 0
  227. while true:
  228. case g.buf[pos]
  229. of '\0': break
  230. of '#':
  231. if isDoc:
  232. if g.buf[pos+1] == '#' and g.buf[pos+2] == '[':
  233. inc nesting
  234. elif g.buf[pos+1] == '[':
  235. inc nesting
  236. inc pos
  237. of ']':
  238. if isDoc:
  239. if g.buf[pos+1] == '#' and g.buf[pos+2] == '#':
  240. if nesting == 0:
  241. inc(pos, 3)
  242. break
  243. dec nesting
  244. elif g.buf[pos+1] == '#':
  245. if nesting == 0:
  246. inc(pos, 2)
  247. break
  248. dec nesting
  249. inc pos
  250. else:
  251. inc pos
  252. else:
  253. while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
  254. of 'a'..'z', 'A'..'Z', '_', '\x80'..'\xFF':
  255. var id = ""
  256. while g.buf[pos] in SymChars + {'_'}:
  257. add(id, g.buf[pos])
  258. inc(pos)
  259. if (g.buf[pos] == '\"'):
  260. if (g.buf[pos + 1] == '\"') and (g.buf[pos + 2] == '\"'):
  261. inc(pos, 3)
  262. g.kind = gtLongStringLit
  263. while true:
  264. case g.buf[pos]
  265. of '\0':
  266. break
  267. of '\"':
  268. inc(pos)
  269. if g.buf[pos] == '\"' and g.buf[pos+1] == '\"' and
  270. g.buf[pos+2] != '\"':
  271. inc(pos, 2)
  272. break
  273. else: inc(pos)
  274. else:
  275. g.kind = gtRawData
  276. inc(pos)
  277. while not (g.buf[pos] in {'\0', '\n', '\r'}):
  278. if g.buf[pos] == '"' and g.buf[pos+1] != '"': break
  279. inc(pos)
  280. if g.buf[pos] == '\"': inc(pos)
  281. else:
  282. if g.lang == langNim:
  283. g.kind = nimGetKeyword(id)
  284. elif isKeyword(keywords, id) >= 0:
  285. g.kind = gtKeyword
  286. of '0':
  287. inc(pos)
  288. case g.buf[pos]
  289. of 'b', 'B':
  290. g.kind = gtBinNumber
  291. inc(pos)
  292. while g.buf[pos] in binChars: inc(pos)
  293. pos = nimNumberPostfix(g, pos)
  294. of 'x', 'X':
  295. g.kind = gtHexNumber
  296. inc(pos)
  297. while g.buf[pos] in hexChars: inc(pos)
  298. pos = nimNumberPostfix(g, pos)
  299. of 'o', 'O':
  300. g.kind = gtOctNumber
  301. inc(pos)
  302. while g.buf[pos] in octChars: inc(pos)
  303. pos = nimNumberPostfix(g, pos)
  304. else: pos = nimNumber(g, pos)
  305. of '1'..'9':
  306. pos = nimNumber(g, pos)
  307. of '\'':
  308. inc(pos)
  309. g.kind = gtCharLit
  310. while true:
  311. case g.buf[pos]
  312. of '\0', '\r', '\n':
  313. break
  314. of '\'':
  315. inc(pos)
  316. break
  317. of '\\':
  318. inc(pos, 2)
  319. else: inc(pos)
  320. of '\"':
  321. inc(pos)
  322. if (g.buf[pos] == '\"') and (g.buf[pos + 1] == '\"'):
  323. inc(pos, 2)
  324. g.kind = gtLongStringLit
  325. while true:
  326. case g.buf[pos]
  327. of '\0':
  328. break
  329. of '\"':
  330. inc(pos)
  331. if g.buf[pos] == '\"' and g.buf[pos+1] == '\"' and
  332. g.buf[pos+2] != '\"':
  333. inc(pos, 2)
  334. break
  335. else: inc(pos)
  336. else:
  337. g.kind = gtStringLit
  338. while true:
  339. case g.buf[pos]
  340. of '\0', '\r', '\n':
  341. break
  342. of '\"':
  343. inc(pos)
  344. break
  345. of '\\':
  346. g.state = g.kind
  347. break
  348. else: inc(pos)
  349. of '(', ')', '[', ']', '{', '}', '`', ':', ',', ';':
  350. inc(pos)
  351. g.kind = gtPunctuation
  352. of '\0':
  353. g.kind = gtEof
  354. else:
  355. if g.buf[pos] in OpChars:
  356. g.kind = gtOperator
  357. while g.buf[pos] in OpChars: inc(pos)
  358. else:
  359. inc(pos)
  360. g.kind = gtNone
  361. g.length = pos - g.pos
  362. if g.kind != gtEof and g.state != gtNone and g.length <= 0:
  363. assert false, "nimNextToken: produced an empty token"
  364. g.pos = pos
  365. proc generalNumber(g: var GeneralTokenizer, position: int): int =
  366. const decChars = {'0'..'9'}
  367. var pos = position
  368. g.kind = gtDecNumber
  369. while g.buf[pos] in decChars: inc(pos)
  370. if g.buf[pos] == '.':
  371. g.kind = gtFloatNumber
  372. inc(pos)
  373. while g.buf[pos] in decChars: inc(pos)
  374. if g.buf[pos] in {'e', 'E'}:
  375. g.kind = gtFloatNumber
  376. inc(pos)
  377. if g.buf[pos] in {'+', '-'}: inc(pos)
  378. while g.buf[pos] in decChars: inc(pos)
  379. result = pos
  380. proc generalStrLit(g: var GeneralTokenizer, position: int): int =
  381. const
  382. decChars = {'0'..'9'}
  383. hexChars = {'0'..'9', 'A'..'F', 'a'..'f'}
  384. var pos = position
  385. g.kind = gtStringLit
  386. var c = g.buf[pos]
  387. inc(pos) # skip " or '
  388. while true:
  389. case g.buf[pos]
  390. of '\0':
  391. break
  392. of '\\':
  393. inc(pos)
  394. case g.buf[pos]
  395. of '\0':
  396. break
  397. of '0'..'9':
  398. while g.buf[pos] in decChars: inc(pos)
  399. of 'x', 'X':
  400. inc(pos)
  401. if g.buf[pos] in hexChars: inc(pos)
  402. if g.buf[pos] in hexChars: inc(pos)
  403. else: inc(pos, 2)
  404. else:
  405. if g.buf[pos] == c:
  406. inc(pos)
  407. break
  408. else:
  409. inc(pos)
  410. result = pos
  411. type
  412. TokenizerFlag = enum
  413. hasPreprocessor, hasNestedComments
  414. TokenizerFlags = set[TokenizerFlag]
  415. proc clikeNextToken(g: var GeneralTokenizer, keywords: openArray[string],
  416. flags: TokenizerFlags) =
  417. const
  418. hexChars = {'0'..'9', 'A'..'F', 'a'..'f'}
  419. octChars = {'0'..'7'}
  420. binChars = {'0'..'1'}
  421. symChars = {'A'..'Z', 'a'..'z', '0'..'9', '_', '\x80'..'\xFF'}
  422. var pos = g.pos
  423. g.start = g.pos
  424. if g.state == gtStringLit:
  425. g.kind = gtStringLit
  426. while true:
  427. case g.buf[pos]
  428. of '\\':
  429. g.kind = gtEscapeSequence
  430. inc(pos)
  431. case g.buf[pos]
  432. of 'x', 'X':
  433. inc(pos)
  434. if g.buf[pos] in hexChars: inc(pos)
  435. if g.buf[pos] in hexChars: inc(pos)
  436. of '0'..'9':
  437. while g.buf[pos] in {'0'..'9'}: inc(pos)
  438. of '\0':
  439. g.state = gtNone
  440. else: inc(pos)
  441. break
  442. of '\0', '\r', '\n':
  443. g.state = gtNone
  444. break
  445. of '\"':
  446. inc(pos)
  447. g.state = gtNone
  448. break
  449. else: inc(pos)
  450. else:
  451. case g.buf[pos]
  452. of ' ', '\t'..'\r':
  453. g.kind = gtWhitespace
  454. while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos)
  455. of '/':
  456. inc(pos)
  457. if g.buf[pos] == '/':
  458. g.kind = gtComment
  459. while not (g.buf[pos] in {'\0', '\n', '\r'}): inc(pos)
  460. elif g.buf[pos] == '*':
  461. g.kind = gtLongComment
  462. var nested = 0
  463. inc(pos)
  464. while true:
  465. case g.buf[pos]
  466. of '*':
  467. inc(pos)
  468. if g.buf[pos] == '/':
  469. inc(pos)
  470. if nested == 0: break
  471. of '/':
  472. inc(pos)
  473. if g.buf[pos] == '*':
  474. inc(pos)
  475. if hasNestedComments in flags: inc(nested)
  476. of '\0':
  477. break
  478. else: inc(pos)
  479. of '#':
  480. inc(pos)
  481. if hasPreprocessor in flags:
  482. g.kind = gtPreprocessor
  483. while g.buf[pos] in {' ', '\t'}: inc(pos)
  484. while g.buf[pos] in symChars: inc(pos)
  485. else:
  486. g.kind = gtOperator
  487. of 'a'..'z', 'A'..'Z', '_', '\x80'..'\xFF':
  488. var id = ""
  489. while g.buf[pos] in symChars:
  490. add(id, g.buf[pos])
  491. inc(pos)
  492. if isKeyword(keywords, id) >= 0: g.kind = gtKeyword
  493. else: g.kind = gtIdentifier
  494. of '0':
  495. inc(pos)
  496. case g.buf[pos]
  497. of 'b', 'B':
  498. inc(pos)
  499. while g.buf[pos] in binChars: inc(pos)
  500. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  501. of 'x', 'X':
  502. inc(pos)
  503. while g.buf[pos] in hexChars: inc(pos)
  504. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  505. of '0'..'7':
  506. inc(pos)
  507. while g.buf[pos] in octChars: inc(pos)
  508. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  509. else:
  510. pos = generalNumber(g, pos)
  511. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  512. of '1'..'9':
  513. pos = generalNumber(g, pos)
  514. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  515. of '\'':
  516. pos = generalStrLit(g, pos)
  517. g.kind = gtCharLit
  518. of '\"':
  519. inc(pos)
  520. g.kind = gtStringLit
  521. while true:
  522. case g.buf[pos]
  523. of '\0':
  524. break
  525. of '\"':
  526. inc(pos)
  527. break
  528. of '\\':
  529. g.state = g.kind
  530. break
  531. else: inc(pos)
  532. of '(', ')', '[', ']', '{', '}', ':', ',', ';', '.':
  533. inc(pos)
  534. g.kind = gtPunctuation
  535. of '\0':
  536. g.kind = gtEof
  537. else:
  538. if g.buf[pos] in OpChars:
  539. g.kind = gtOperator
  540. while g.buf[pos] in OpChars: inc(pos)
  541. else:
  542. inc(pos)
  543. g.kind = gtNone
  544. g.length = pos - g.pos
  545. if g.kind != gtEof and g.length <= 0:
  546. assert false, "clikeNextToken: produced an empty token"
  547. g.pos = pos
  548. proc cNextToken(g: var GeneralTokenizer) =
  549. const
  550. keywords: array[0..36, string] = ["_Bool", "_Complex", "_Imaginary", "auto",
  551. "break", "case", "char", "const", "continue", "default", "do", "double",
  552. "else", "enum", "extern", "float", "for", "goto", "if", "inline", "int",
  553. "long", "register", "restrict", "return", "short", "signed", "sizeof",
  554. "static", "struct", "switch", "typedef", "union", "unsigned", "void",
  555. "volatile", "while"]
  556. clikeNextToken(g, keywords, {hasPreprocessor})
  557. proc cppNextToken(g: var GeneralTokenizer) =
  558. const
  559. keywords: array[0..47, string] = ["asm", "auto", "break", "case", "catch",
  560. "char", "class", "const", "continue", "default", "delete", "do", "double",
  561. "else", "enum", "extern", "float", "for", "friend", "goto", "if",
  562. "inline", "int", "long", "new", "operator", "private", "protected",
  563. "public", "register", "return", "short", "signed", "sizeof", "static",
  564. "struct", "switch", "template", "this", "throw", "try", "typedef",
  565. "union", "unsigned", "virtual", "void", "volatile", "while"]
  566. clikeNextToken(g, keywords, {hasPreprocessor})
  567. proc csharpNextToken(g: var GeneralTokenizer) =
  568. const
  569. keywords: array[0..76, string] = ["abstract", "as", "base", "bool", "break",
  570. "byte", "case", "catch", "char", "checked", "class", "const", "continue",
  571. "decimal", "default", "delegate", "do", "double", "else", "enum", "event",
  572. "explicit", "extern", "false", "finally", "fixed", "float", "for",
  573. "foreach", "goto", "if", "implicit", "in", "int", "interface", "internal",
  574. "is", "lock", "long", "namespace", "new", "null", "object", "operator",
  575. "out", "override", "params", "private", "protected", "public", "readonly",
  576. "ref", "return", "sbyte", "sealed", "short", "sizeof", "stackalloc",
  577. "static", "string", "struct", "switch", "this", "throw", "true", "try",
  578. "typeof", "uint", "ulong", "unchecked", "unsafe", "ushort", "using",
  579. "virtual", "void", "volatile", "while"]
  580. clikeNextToken(g, keywords, {hasPreprocessor})
  581. proc javaNextToken(g: var GeneralTokenizer) =
  582. const
  583. keywords: array[0..52, string] = ["abstract", "assert", "boolean", "break",
  584. "byte", "case", "catch", "char", "class", "const", "continue", "default",
  585. "do", "double", "else", "enum", "extends", "false", "final", "finally",
  586. "float", "for", "goto", "if", "implements", "import", "instanceof", "int",
  587. "interface", "long", "native", "new", "null", "package", "private",
  588. "protected", "public", "return", "short", "static", "strictfp", "super",
  589. "switch", "synchronized", "this", "throw", "throws", "transient", "true",
  590. "try", "void", "volatile", "while"]
  591. clikeNextToken(g, keywords, {})
  592. proc yamlPlainStrLit(g: var GeneralTokenizer, pos: var int) =
  593. g.kind = gtStringLit
  594. while g.buf[pos] notin {'\0', '\t'..'\r', ',', ']', '}'}:
  595. if g.buf[pos] == ':' and
  596. g.buf[pos + 1] in {'\0', '\t'..'\r', ' '}:
  597. break
  598. inc(pos)
  599. proc yamlPossibleNumber(g: var GeneralTokenizer, pos: var int) =
  600. g.kind = gtNone
  601. if g.buf[pos] == '-': inc(pos)
  602. if g.buf[pos] == '0': inc(pos)
  603. elif g.buf[pos] in '1'..'9':
  604. inc(pos)
  605. while g.buf[pos] in {'0'..'9'}: inc(pos)
  606. else: yamlPlainStrLit(g, pos)
  607. if g.kind == gtNone:
  608. if g.buf[pos] in {'\0', '\t'..'\r', ' ', ',', ']', '}'}:
  609. g.kind = gtDecNumber
  610. elif g.buf[pos] == '.':
  611. inc(pos)
  612. if g.buf[pos] notin {'0'..'9'}: yamlPlainStrLit(g, pos)
  613. else:
  614. while g.buf[pos] in {'0'..'9'}: inc(pos)
  615. if g.buf[pos] in {'\0', '\t'..'\r', ' ', ',', ']', '}'}:
  616. g.kind = gtFloatNumber
  617. if g.kind == gtNone:
  618. if g.buf[pos] in {'e', 'E'}:
  619. inc(pos)
  620. if g.buf[pos] in {'-', '+'}: inc(pos)
  621. if g.buf[pos] notin {'0'..'9'}: yamlPlainStrLit(g, pos)
  622. else:
  623. while g.buf[pos] in {'0'..'9'}: inc(pos)
  624. if g.buf[pos] in {'\0', '\t'..'\r', ' ', ',', ']', '}'}:
  625. g.kind = gtFloatNumber
  626. else: yamlPlainStrLit(g, pos)
  627. else: yamlPlainStrLit(g, pos)
  628. while g.buf[pos] notin {'\0', ',', ']', '}', '\n', '\r'}:
  629. inc(pos)
  630. if g.buf[pos] notin {'\t'..'\r', ' ', ',', ']', '}'}:
  631. yamlPlainStrLit(g, pos)
  632. break
  633. # theoretically, we would need to parse indentation (like with block scalars)
  634. # because of possible multiline flow scalars that start with number-like
  635. # content, but that is far too troublesome. I think it is fine that the
  636. # highlighter is sloppy here.
  637. proc yamlNextToken(g: var GeneralTokenizer) =
  638. const
  639. hexChars = {'0'..'9', 'A'..'F', 'a'..'f'}
  640. var pos = g.pos
  641. g.start = g.pos
  642. if g.state == gtStringLit:
  643. g.kind = gtStringLit
  644. while true:
  645. case g.buf[pos]
  646. of '\\':
  647. if pos != g.pos: break
  648. g.kind = gtEscapeSequence
  649. inc(pos)
  650. case g.buf[pos]
  651. of 'x':
  652. inc(pos)
  653. for i in 1..2:
  654. if g.buf[pos] in hexChars: inc(pos)
  655. break
  656. of 'u':
  657. inc(pos)
  658. for i in 1..4:
  659. if g.buf[pos] in hexChars: inc(pos)
  660. break
  661. of 'U':
  662. inc(pos)
  663. for i in 1..8:
  664. if g.buf[pos] in hexChars: inc(pos)
  665. break
  666. else: inc(pos)
  667. break
  668. of '\0':
  669. g.state = gtOther
  670. break
  671. of '\"':
  672. inc(pos)
  673. g.state = gtOther
  674. break
  675. else: inc(pos)
  676. elif g.state == gtCharLit:
  677. # abusing gtCharLit as single-quoted string lit
  678. g.kind = gtStringLit
  679. inc(pos) # skip the starting '
  680. while true:
  681. case g.buf[pos]
  682. of '\'':
  683. inc(pos)
  684. if g.buf[pos] == '\'':
  685. inc(pos)
  686. g.kind = gtEscapeSequence
  687. else: g.state = gtOther
  688. break
  689. else: inc(pos)
  690. elif g.state == gtCommand:
  691. # gtCommand means 'block scalar header'
  692. case g.buf[pos]
  693. of ' ', '\t':
  694. g.kind = gtWhitespace
  695. while g.buf[pos] in {' ', '\t'}: inc(pos)
  696. of '#':
  697. g.kind = gtComment
  698. while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
  699. of '\n', '\r': discard
  700. else:
  701. # illegal here. just don't parse a block scalar
  702. g.kind = gtNone
  703. g.state = gtOther
  704. if g.buf[pos] in {'\n', '\r'} and g.state == gtCommand:
  705. g.state = gtLongStringLit
  706. elif g.state == gtLongStringLit:
  707. # beware, this is the only token where we actually have to parse
  708. # indentation.
  709. g.kind = gtLongStringLit
  710. # first, we have to find the parent indentation of the block scalar, so that
  711. # we know when to stop
  712. assert g.buf[pos] in {'\n', '\r'}
  713. var lookbehind = pos - 1
  714. var headerStart = -1
  715. while lookbehind >= 0 and g.buf[lookbehind] notin {'\n', '\r'}:
  716. if headerStart == -1 and g.buf[lookbehind] in {'|', '>'}:
  717. headerStart = lookbehind
  718. dec(lookbehind)
  719. assert headerStart != -1
  720. var indentation = 1
  721. while g.buf[lookbehind + indentation] == ' ': inc(indentation)
  722. if g.buf[lookbehind + indentation] in {'|', '>'}:
  723. # when the header is alone in a line, this line does not show the parent's
  724. # indentation, so we must go further. search the first previous line with
  725. # non-whitespace content.
  726. while lookbehind >= 0 and g.buf[lookbehind] in {'\n', '\r'}:
  727. dec(lookbehind)
  728. while lookbehind >= 0 and
  729. g.buf[lookbehind] in {' ', '\t'}: dec(lookbehind)
  730. # now, find the beginning of the line...
  731. while lookbehind >= 0 and g.buf[lookbehind] notin {'\n', '\r'}:
  732. dec(lookbehind)
  733. # ... and its indentation
  734. indentation = 1
  735. while g.buf[lookbehind + indentation] == ' ': inc(indentation)
  736. if lookbehind == -1: indentation = 0 # top level
  737. elif g.buf[lookbehind + 1] == '-' and g.buf[lookbehind + 2] == '-' and
  738. g.buf[lookbehind + 3] == '-' and
  739. g.buf[lookbehind + 4] in {'\t'..'\r', ' '}:
  740. # this is a document start, therefore, we are at top level
  741. indentation = 0
  742. # because lookbehind was at newline char when calculating indentation, we're
  743. # off by one. fix that. top level's parent will have indentation of -1.
  744. let parentIndentation = indentation - 1
  745. # find first content
  746. while g.buf[pos] in {' ', '\n', '\r'}:
  747. if g.buf[pos] == ' ': inc(indentation)
  748. else: indentation = 0
  749. inc(pos)
  750. var minIndentation = indentation
  751. # for stupid edge cases, we must check whether an explicit indentation depth
  752. # is given at the header.
  753. while g.buf[headerStart] in {'>', '|', '+', '-'}: inc(headerStart)
  754. if g.buf[headerStart] in {'0'..'9'}:
  755. minIndentation = min(minIndentation, ord(g.buf[headerStart]) - ord('0'))
  756. # process content lines
  757. while indentation > parentIndentation and g.buf[pos] != '\0':
  758. if (indentation < minIndentation and g.buf[pos] == '#') or
  759. (indentation == 0 and g.buf[pos] == '.' and g.buf[pos + 1] == '.' and
  760. g.buf[pos + 2] == '.' and
  761. g.buf[pos + 3] in {'\0', '\t'..'\r', ' '}):
  762. # comment after end of block scalar, or end of document
  763. break
  764. minIndentation = min(indentation, minIndentation)
  765. while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
  766. while g.buf[pos] in {' ', '\n', '\r'}:
  767. if g.buf[pos] == ' ': inc(indentation)
  768. else: indentation = 0
  769. inc(pos)
  770. g.state = gtOther
  771. elif g.state == gtOther:
  772. # gtOther means 'inside YAML document'
  773. case g.buf[pos]
  774. of ' ', '\t'..'\r':
  775. g.kind = gtWhitespace
  776. while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos)
  777. of '#':
  778. g.kind = gtComment
  779. inc(pos)
  780. while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
  781. of '-':
  782. inc(pos)
  783. if g.buf[pos] in {'\0', ' ', '\t'..'\r'}:
  784. g.kind = gtPunctuation
  785. elif g.buf[pos] == '-' and
  786. (pos == 1 or g.buf[pos - 2] in {'\n', '\r'}): # start of line
  787. inc(pos)
  788. if g.buf[pos] == '-' and g.buf[pos + 1] in {'\0', '\t'..'\r', ' '}:
  789. inc(pos)
  790. g.kind = gtKeyword
  791. else: yamlPossibleNumber(g, pos)
  792. else: yamlPossibleNumber(g, pos)
  793. of '.':
  794. if pos == 0 or g.buf[pos - 1] in {'\n', '\r'}:
  795. inc(pos)
  796. for i in 1..2:
  797. if g.buf[pos] != '.': break
  798. inc(pos)
  799. if pos == g.start + 3:
  800. g.kind = gtKeyword
  801. g.state = gtNone
  802. else: yamlPlainStrLit(g, pos)
  803. else: yamlPlainStrLit(g, pos)
  804. of '?':
  805. inc(pos)
  806. if g.buf[pos] in {'\0', ' ', '\t'..'\r'}:
  807. g.kind = gtPunctuation
  808. else: yamlPlainStrLit(g, pos)
  809. of ':':
  810. inc(pos)
  811. if g.buf[pos] in {'\0', '\t'..'\r', ' ', '\'', '\"'} or
  812. (pos > 0 and g.buf[pos - 2] in {'}', ']', '\"', '\''}):
  813. g.kind = gtPunctuation
  814. else: yamlPlainStrLit(g, pos)
  815. of '[', ']', '{', '}', ',':
  816. inc(pos)
  817. g.kind = gtPunctuation
  818. of '\"':
  819. inc(pos)
  820. g.state = gtStringLit
  821. g.kind = gtStringLit
  822. of '\'':
  823. g.state = gtCharLit
  824. g.kind = gtNone
  825. of '!':
  826. g.kind = gtTagStart
  827. inc(pos)
  828. if g.buf[pos] == '<':
  829. # literal tag (e.g. `!<tag:yaml.org,2002:str>`)
  830. while g.buf[pos] notin {'\0', '>', '\t'..'\r', ' '}: inc(pos)
  831. if g.buf[pos] == '>': inc(pos)
  832. else:
  833. while g.buf[pos] in {'A'..'Z', 'a'..'z', '0'..'9', '-'}: inc(pos)
  834. case g.buf[pos]
  835. of '!':
  836. # prefixed tag (e.g. `!!str`)
  837. inc(pos)
  838. while g.buf[pos] notin
  839. {'\0', '\t'..'\r', ' ', ',', '[', ']', '{', '}'}: inc(pos)
  840. of '\0', '\t'..'\r', ' ': discard
  841. else:
  842. # local tag (e.g. `!nim:system:int`)
  843. while g.buf[pos] notin {'\0', '\t'..'\r', ' '}: inc(pos)
  844. of '&':
  845. g.kind = gtLabel
  846. while g.buf[pos] notin {'\0', '\t'..'\r', ' '}: inc(pos)
  847. of '*':
  848. g.kind = gtReference
  849. while g.buf[pos] notin {'\0', '\t'..'\r', ' '}: inc(pos)
  850. of '|', '>':
  851. # this can lead to incorrect tokenization when | or > appear inside flow
  852. # content. checking whether we're inside flow content is not
  853. # chomsky type-3, so we won't do that here.
  854. g.kind = gtCommand
  855. g.state = gtCommand
  856. inc(pos)
  857. while g.buf[pos] in {'0'..'9', '+', '-'}: inc(pos)
  858. of '0'..'9': yamlPossibleNumber(g, pos)
  859. of '\0': g.kind = gtEof
  860. else: yamlPlainStrLit(g, pos)
  861. else:
  862. # outside document
  863. case g.buf[pos]
  864. of '%':
  865. if pos == 0 or g.buf[pos - 1] in {'\n', '\r'}:
  866. g.kind = gtDirective
  867. while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
  868. else:
  869. g.state = gtOther
  870. yamlPlainStrLit(g, pos)
  871. of ' ', '\t'..'\r':
  872. g.kind = gtWhitespace
  873. while g.buf[pos] in {' ', '\t'..'\r'}: inc(pos)
  874. of '#':
  875. g.kind = gtComment
  876. while g.buf[pos] notin {'\0', '\n', '\r'}: inc(pos)
  877. of '\0': g.kind = gtEof
  878. else:
  879. g.kind = gtNone
  880. g.state = gtOther
  881. g.length = pos - g.pos
  882. g.pos = pos
  proc pythonNextToken(g: var GeneralTokenizer) =
    ## Scans the next token of Python source by handing `g` to `nimNextToken`
    ## together with the list of Python keywords.
    const pyKeywords = [
      "False", "None", "True",
      "and", "as", "assert", "async", "await",
      "break", "class", "continue",
      "def", "del",
      "elif", "else", "except",
      "finally", "for", "from",
      "global",
      "if", "import", "in", "is",
      "lambda",
      "nonlocal", "not",
      "or",
      "pass",
      "raise", "return",
      "try",
      "while", "with",
      "yield"]
    nimNextToken(g, pyKeywords)
  proc cmdNextToken(g: var GeneralTokenizer, dollarPrompt = false) =
    ## Scans one token of a command line starting at `g.pos`.
    ##
    ## On return `g.kind` holds the token class, `g.start` and `g.length`
    ## its extent, and `g.pos` points just past it. `g.state` carries the
    ## expectation for the next word between calls: `gtPrompt` (a `$ `
    ## prompt may follow; entered only when `dollarPrompt` is true),
    ## `gtProgram` (a command name follows) or `gtOption` (an argument
    ## follows).
    const
      blanks = {' ', '\t'..'\r'}
      lineEnd = {'\n', '\0'}
      wordEnd = {' ', '\t'..'\r', '&', '|', '(', ')', '\'', '"', '\0'}
    # State assumed on the very first call and again after each newline.
    let lineStart = if dollarPrompt: gtPrompt else: gtProgram
    var cur = g.pos
    g.start = cur
    if g.state == low(TokenClass):
      g.state = lineStart
    case g.buf[cur]
    of ' ', '\t'..'\r':
      g.kind = gtWhitespace
      while g.buf[cur] in blanks:
        # A newline inside the whitespace run starts a fresh line.
        if g.buf[cur] == '\n':
          g.state = lineStart
        inc(cur)
    of '\'', '"':
      # Quoted text runs up to the matching quote or the end of input.
      g.kind = gtOption
      let delim = g.buf[cur]
      inc(cur)
      while g.buf[cur] != delim and g.buf[cur] != '\0':
        inc(cur)
      if g.buf[cur] == delim: inc(cur)
    of '#':
      g.kind = gtComment
      while g.buf[cur] notin lineEnd:
        inc(cur)
    of '&', '|':
      # A doubled character (`&&`, `||`) is consumed as a single operator.
      g.kind = gtOperator
      inc(cur)
      if g.buf[cur] == g.buf[cur-1]: inc(cur)
      g.state = gtProgram
    of '(', ';':
      # Both are followed by a new command.
      g.kind = gtOperator
      g.state = gtProgram
      inc(cur)
    of ')':
      g.kind = gtOperator
      inc(cur)
    of '\0':
      g.kind = gtEof
    elif dollarPrompt and g.state == gtPrompt:
      if g.buf[cur] == '$' and g.buf[cur+1] in {' ', '\t'}:
        # `$` plus one blank is the prompt token; a command name follows.
        g.kind = gtPrompt
        g.state = gtProgram
        inc(cur, 2)
      else:
        # Anything else in prompt state is classified as program output
        # up to the end of the line.
        g.kind = gtProgramOutput
        while g.buf[cur] notin lineEnd:
          inc(cur)
    else:
      # A bare word: command name if one is expected, otherwise an option.
      if g.state == gtProgram:
        g.kind = gtProgram
        g.state = gtOption
      else:
        g.kind = gtOption
      while g.buf[cur] notin wordEnd:
        let ch = g.buf[cur]
        if ch == ';' and g.buf[cur+1] == ' ':
          # (check space because ';' can be used inside arguments in Win bat)
          break
        if g.kind == gtOption and ch in {'/', '\\', '.'}:
          g.kind = gtIdentifier # for file/dir name
        elif g.kind == gtProgram and ch == '=':
          g.kind = gtIdentifier # for env variable setting at beginning of line
          g.state = gtProgram
        inc(cur)
    g.length = cur - g.pos
    g.pos = cur
  proc getNextToken*(g: var GeneralTokenizer, lang: SourceLanguage) =
    ## Stores `lang` in `g.lang` and scans the next token with the tokenizer
    ## that belongs to that language. `langNone` has no tokenizer and trips
    ## an assertion instead.
    g.lang = lang
    case lang
    of langNim: g.nimNextToken()
    of langC: g.cNextToken()
    of langCpp: g.cppNextToken()
    of langCsharp: g.csharpNextToken()
    of langJava: g.javaNextToken()
    of langPython: g.pythonNextToken()
    of langYaml: g.yamlNextToken()
    of langCmd: g.cmdNextToken()
    # Console sessions share the command tokenizer, with `$ ` prompts enabled.
    of langConsole: g.cmdNextToken(dollarPrompt = true)
    of langNone: assert false
  proc tokenize*(text: string,
                 lang: SourceLanguage): seq[(string, TokenClass)] =
    ## Splits `text` into `(substring, token class)` pairs using the
    ## tokenizer for `lang`. Scanning stops at the first `gtEof` token,
    ## which is not added to the result.
    var tok: GeneralTokenizer
    initGeneralTokenizer(tok, text)
    var tokenBegin = 0
    getNextToken(tok, lang)
    while tok.kind != gtEof:
      # Each substring runs from the end of the previous token to the end
      # of the current one.
      result.add((text[tokenBegin ..< tok.pos], tok.kind))
      tokenBegin = tok.pos
      getNextToken(tok, lang)
  when isMainModule:
    # Self-check: `nimKeywords` must contain exactly the words listed in
    # doc/keywords.txt, in the same order.
    var keywords: seq[string]
    # Try to work running in both the subdir or at the root.
    for filename in ["doc/keywords.txt", "../../../doc/keywords.txt"]:
      try:
        keywords = readFile(filename).splitWhitespace()
        break
      except IOError:
        # Only a failed read means "try the next candidate". A bare `except`
        # would also swallow defects and report them as a missing file.
        echo filename, " not found"
    doAssert(keywords.len > 0, "Couldn't read any keywords.txt file!")
    # Compare the common prefix first so a mismatch names the real problem
    # before the length check fires.
    for i in 0 ..< min(keywords.len, nimKeywords.len):
      doAssert keywords[i] == nimKeywords[i], "Unexpected keyword"
    doAssert keywords.len == nimKeywords.len, "No matching lengths"