highlite.nim 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2012 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
## Source highlighter for programming or markup languages.
## Currently only a few languages are supported; other languages may be added.
## The interface supports one language nested in another.
##
## **Note:** Import ``packages/docutils/highlite`` to use this module
##
## You can use this to build your own syntax highlighting, check this example:
##
## .. code::nim
##   let code = """for x in $int.high: echo x.ord mod 2 == 0"""
##   var toknizr: GeneralTokenizer
##   initGeneralTokenizer(toknizr, code)
##   while true:
##     getNextToken(toknizr, langNim)
##     case toknizr.kind
##     of gtEof: break  # End Of File (or string)
##     of gtWhitespace:
##       echo gtWhitespace # Maybe you want "visible" whitespaces?.
##       echo substr(code, toknizr.start, toknizr.length + toknizr.start - 1)
##     of gtOperator:
##       echo gtOperator # Maybe you want Operators to use a specific color?.
##       echo substr(code, toknizr.start, toknizr.length + toknizr.start - 1)
##     # of gtSomeSymbol: syntaxHighlight("Comic Sans", "bold", "99px", "pink")
##     else:
##       echo toknizr.kind # All the kinds of tokens can be processed here.
##       echo substr(code, toknizr.start, toknizr.length + toknizr.start - 1)
##
## The proc ``getSourceLanguage`` can get the language ``enum`` from a string:
##
## .. code::nim
##   for l in ["C", "c++", "jAvA", "Nim", "c#"]: echo getSourceLanguage(l)
##
  41. import
  42. strutils
  43. from algorithm import binarySearch
  44. type
  45. TokenClass* = enum
  46. gtEof, gtNone, gtWhitespace, gtDecNumber, gtBinNumber, gtHexNumber,
  47. gtOctNumber, gtFloatNumber, gtIdentifier, gtKeyword, gtStringLit,
  48. gtLongStringLit, gtCharLit, gtEscapeSequence, # escape sequence like \xff
  49. gtOperator, gtPunctuation, gtComment, gtLongComment, gtRegularExpression,
  50. gtTagStart, gtTagEnd, gtKey, gtValue, gtRawData, gtAssembler,
  51. gtPreprocessor, gtDirective, gtCommand, gtRule, gtHyperlink, gtLabel,
  52. gtReference, gtOther
  53. GeneralTokenizer* = object of RootObj
  54. kind*: TokenClass
  55. start*, length*: int
  56. buf: cstring
  57. pos: int
  58. state: TokenClass
  59. SourceLanguage* = enum
  60. langNone, langNim, langCpp, langCsharp, langC, langJava,
  61. langYaml
  62. const
  63. sourceLanguageToStr*: array[SourceLanguage, string] = ["none",
  64. "Nim", "C++", "C#", "C", "Java", "Yaml"]
  65. tokenClassToStr*: array[TokenClass, string] = ["Eof", "None", "Whitespace",
  66. "DecNumber", "BinNumber", "HexNumber", "OctNumber", "FloatNumber",
  67. "Identifier", "Keyword", "StringLit", "LongStringLit", "CharLit",
  68. "EscapeSequence", "Operator", "Punctuation", "Comment", "LongComment",
  69. "RegularExpression", "TagStart", "TagEnd", "Key", "Value", "RawData",
  70. "Assembler", "Preprocessor", "Directive", "Command", "Rule", "Hyperlink",
  71. "Label", "Reference", "Other"]
  72. # The following list comes from doc/keywords.txt, make sure it is
  73. # synchronized with this array by running the module itself as a test case.
  74. nimKeywords = ["addr", "and", "as", "asm", "bind", "block",
  75. "break", "case", "cast", "concept", "const", "continue", "converter",
  76. "defer", "discard", "distinct", "div", "do",
  77. "elif", "else", "end", "enum", "except", "export",
  78. "finally", "for", "from", "func",
  79. "if", "import", "in", "include",
  80. "interface", "is", "isnot", "iterator", "let", "macro", "method",
  81. "mixin", "mod", "nil", "not", "notin", "object", "of", "or", "out", "proc",
  82. "ptr", "raise", "ref", "return", "shl", "shr", "static",
  83. "template", "try", "tuple", "type", "using", "var", "when", "while",
  84. "xor", "yield"]
  85. proc getSourceLanguage*(name: string): SourceLanguage =
  86. for i in countup(succ(low(SourceLanguage)), high(SourceLanguage)):
  87. if cmpIgnoreStyle(name, sourceLanguageToStr[i]) == 0:
  88. return i
  89. result = langNone
  90. proc initGeneralTokenizer*(g: var GeneralTokenizer, buf: cstring) =
  91. g.buf = buf
  92. g.kind = low(TokenClass)
  93. g.start = 0
  94. g.length = 0
  95. g.state = low(TokenClass)
  96. var pos = 0 # skip initial whitespace:
  97. while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos)
  98. g.pos = pos
  99. proc initGeneralTokenizer*(g: var GeneralTokenizer, buf: string) =
  100. initGeneralTokenizer(g, cstring(buf))
  101. proc deinitGeneralTokenizer*(g: var GeneralTokenizer) =
  102. discard
  103. proc nimGetKeyword(id: string): TokenClass =
  104. for k in nimKeywords:
  105. if cmpIgnoreStyle(id, k) == 0: return gtKeyword
  106. result = gtIdentifier
  107. when false:
  108. var i = getIdent(id)
  109. if (i.id >= ord(tokKeywordLow) - ord(tkSymbol)) and
  110. (i.id <= ord(tokKeywordHigh) - ord(tkSymbol)):
  111. result = gtKeyword
  112. else:
  113. result = gtIdentifier
  114. proc nimNumberPostfix(g: var GeneralTokenizer, position: int): int =
  115. var pos = position
  116. if g.buf[pos] == '\'':
  117. inc(pos)
  118. case g.buf[pos]
  119. of 'f', 'F':
  120. g.kind = gtFloatNumber
  121. inc(pos)
  122. if g.buf[pos] in {'0'..'9'}: inc(pos)
  123. if g.buf[pos] in {'0'..'9'}: inc(pos)
  124. of 'i', 'I':
  125. inc(pos)
  126. if g.buf[pos] in {'0'..'9'}: inc(pos)
  127. if g.buf[pos] in {'0'..'9'}: inc(pos)
  128. else:
  129. discard
  130. result = pos
  131. proc nimNumber(g: var GeneralTokenizer, position: int): int =
  132. const decChars = {'0'..'9', '_'}
  133. var pos = position
  134. g.kind = gtDecNumber
  135. while g.buf[pos] in decChars: inc(pos)
  136. if g.buf[pos] == '.':
  137. g.kind = gtFloatNumber
  138. inc(pos)
  139. while g.buf[pos] in decChars: inc(pos)
  140. if g.buf[pos] in {'e', 'E'}:
  141. g.kind = gtFloatNumber
  142. inc(pos)
  143. if g.buf[pos] in {'+', '-'}: inc(pos)
  144. while g.buf[pos] in decChars: inc(pos)
  145. result = nimNumberPostfix(g, pos)
  146. const
  147. OpChars = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.',
  148. '|', '=', '%', '&', '$', '@', '~', ':'}
  149. proc nimNextToken(g: var GeneralTokenizer) =
  150. const
  151. hexChars = {'0'..'9', 'A'..'F', 'a'..'f', '_'}
  152. octChars = {'0'..'7', '_'}
  153. binChars = {'0'..'1', '_'}
  154. SymChars = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'}
  155. var pos = g.pos
  156. g.start = g.pos
  157. if g.state == gtStringLit:
  158. g.kind = gtStringLit
  159. while true:
  160. case g.buf[pos]
  161. of '\\':
  162. g.kind = gtEscapeSequence
  163. inc(pos)
  164. case g.buf[pos]
  165. of 'x', 'X':
  166. inc(pos)
  167. if g.buf[pos] in hexChars: inc(pos)
  168. if g.buf[pos] in hexChars: inc(pos)
  169. of '0'..'9':
  170. while g.buf[pos] in {'0'..'9'}: inc(pos)
  171. of '\0':
  172. g.state = gtNone
  173. else: inc(pos)
  174. break
  175. of '\0', '\x0D', '\x0A':
  176. g.state = gtNone
  177. break
  178. of '\"':
  179. inc(pos)
  180. g.state = gtNone
  181. break
  182. else: inc(pos)
  183. else:
  184. case g.buf[pos]
  185. of ' ', '\x09'..'\x0D':
  186. g.kind = gtWhitespace
  187. while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos)
  188. of '#':
  189. g.kind = gtComment
  190. inc(pos)
  191. var isDoc = false
  192. if g.buf[pos] == '#':
  193. inc(pos)
  194. isDoc = true
  195. if g.buf[pos] == '[':
  196. g.kind = gtLongComment
  197. var nesting = 0
  198. while true:
  199. case g.buf[pos]
  200. of '\0': break
  201. of '#':
  202. if isDoc:
  203. if g.buf[pos+1] == '#' and g.buf[pos+2] == '[':
  204. inc nesting
  205. elif g.buf[pos+1] == '[':
  206. inc nesting
  207. inc pos
  208. of ']':
  209. if isDoc:
  210. if g.buf[pos+1] == '#' and g.buf[pos+2] == '#':
  211. if nesting == 0:
  212. inc(pos, 3)
  213. break
  214. dec nesting
  215. elif g.buf[pos+1] == '#':
  216. if nesting == 0:
  217. inc(pos, 2)
  218. break
  219. dec nesting
  220. inc pos
  221. else:
  222. inc pos
  223. else:
  224. while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos)
  225. of 'a'..'z', 'A'..'Z', '_', '\x80'..'\xFF':
  226. var id = ""
  227. while g.buf[pos] in SymChars + {'_'}:
  228. add(id, g.buf[pos])
  229. inc(pos)
  230. if (g.buf[pos] == '\"'):
  231. if (g.buf[pos + 1] == '\"') and (g.buf[pos + 2] == '\"'):
  232. inc(pos, 3)
  233. g.kind = gtLongStringLit
  234. while true:
  235. case g.buf[pos]
  236. of '\0':
  237. break
  238. of '\"':
  239. inc(pos)
  240. if g.buf[pos] == '\"' and g.buf[pos+1] == '\"' and
  241. g.buf[pos+2] != '\"':
  242. inc(pos, 2)
  243. break
  244. else: inc(pos)
  245. else:
  246. g.kind = gtRawData
  247. inc(pos)
  248. while not (g.buf[pos] in {'\0', '\x0A', '\x0D'}):
  249. if g.buf[pos] == '"' and g.buf[pos+1] != '"': break
  250. inc(pos)
  251. if g.buf[pos] == '\"': inc(pos)
  252. else:
  253. g.kind = nimGetKeyword(id)
  254. of '0':
  255. inc(pos)
  256. case g.buf[pos]
  257. of 'b', 'B':
  258. g.kind = gtBinNumber
  259. inc(pos)
  260. while g.buf[pos] in binChars: inc(pos)
  261. pos = nimNumberPostfix(g, pos)
  262. of 'x', 'X':
  263. g.kind = gtHexNumber
  264. inc(pos)
  265. while g.buf[pos] in hexChars: inc(pos)
  266. pos = nimNumberPostfix(g, pos)
  267. of 'o', 'O':
  268. g.kind = gtOctNumber
  269. inc(pos)
  270. while g.buf[pos] in octChars: inc(pos)
  271. pos = nimNumberPostfix(g, pos)
  272. else: pos = nimNumber(g, pos)
  273. of '1'..'9':
  274. pos = nimNumber(g, pos)
  275. of '\'':
  276. inc(pos)
  277. g.kind = gtCharLit
  278. while true:
  279. case g.buf[pos]
  280. of '\0', '\x0D', '\x0A':
  281. break
  282. of '\'':
  283. inc(pos)
  284. break
  285. of '\\':
  286. inc(pos, 2)
  287. else: inc(pos)
  288. of '\"':
  289. inc(pos)
  290. if (g.buf[pos] == '\"') and (g.buf[pos + 1] == '\"'):
  291. inc(pos, 2)
  292. g.kind = gtLongStringLit
  293. while true:
  294. case g.buf[pos]
  295. of '\0':
  296. break
  297. of '\"':
  298. inc(pos)
  299. if g.buf[pos] == '\"' and g.buf[pos+1] == '\"' and
  300. g.buf[pos+2] != '\"':
  301. inc(pos, 2)
  302. break
  303. else: inc(pos)
  304. else:
  305. g.kind = gtStringLit
  306. while true:
  307. case g.buf[pos]
  308. of '\0', '\x0D', '\x0A':
  309. break
  310. of '\"':
  311. inc(pos)
  312. break
  313. of '\\':
  314. g.state = g.kind
  315. break
  316. else: inc(pos)
  317. of '(', ')', '[', ']', '{', '}', '`', ':', ',', ';':
  318. inc(pos)
  319. g.kind = gtPunctuation
  320. of '\0':
  321. g.kind = gtEof
  322. else:
  323. if g.buf[pos] in OpChars:
  324. g.kind = gtOperator
  325. while g.buf[pos] in OpChars: inc(pos)
  326. else:
  327. inc(pos)
  328. g.kind = gtNone
  329. g.length = pos - g.pos
  330. if g.kind != gtEof and g.length <= 0:
  331. assert false, "nimNextToken: produced an empty token"
  332. g.pos = pos
  333. proc generalNumber(g: var GeneralTokenizer, position: int): int =
  334. const decChars = {'0'..'9'}
  335. var pos = position
  336. g.kind = gtDecNumber
  337. while g.buf[pos] in decChars: inc(pos)
  338. if g.buf[pos] == '.':
  339. g.kind = gtFloatNumber
  340. inc(pos)
  341. while g.buf[pos] in decChars: inc(pos)
  342. if g.buf[pos] in {'e', 'E'}:
  343. g.kind = gtFloatNumber
  344. inc(pos)
  345. if g.buf[pos] in {'+', '-'}: inc(pos)
  346. while g.buf[pos] in decChars: inc(pos)
  347. result = pos
  348. proc generalStrLit(g: var GeneralTokenizer, position: int): int =
  349. const
  350. decChars = {'0'..'9'}
  351. hexChars = {'0'..'9', 'A'..'F', 'a'..'f'}
  352. var pos = position
  353. g.kind = gtStringLit
  354. var c = g.buf[pos]
  355. inc(pos) # skip " or '
  356. while true:
  357. case g.buf[pos]
  358. of '\0':
  359. break
  360. of '\\':
  361. inc(pos)
  362. case g.buf[pos]
  363. of '\0':
  364. break
  365. of '0'..'9':
  366. while g.buf[pos] in decChars: inc(pos)
  367. of 'x', 'X':
  368. inc(pos)
  369. if g.buf[pos] in hexChars: inc(pos)
  370. if g.buf[pos] in hexChars: inc(pos)
  371. else: inc(pos, 2)
  372. else:
  373. if g.buf[pos] == c:
  374. inc(pos)
  375. break
  376. else:
  377. inc(pos)
  378. result = pos
  379. proc isKeyword(x: openArray[string], y: string): int =
  380. binarySearch(x, y)
  381. proc isKeywordIgnoreCase(x: openArray[string], y: string): int =
  382. binarySearch(x, y, cmpIgnoreCase)
  383. type
  384. TokenizerFlag = enum
  385. hasPreprocessor, hasNestedComments
  386. TokenizerFlags = set[TokenizerFlag]
  387. proc clikeNextToken(g: var GeneralTokenizer, keywords: openArray[string],
  388. flags: TokenizerFlags) =
  389. const
  390. hexChars = {'0'..'9', 'A'..'F', 'a'..'f'}
  391. octChars = {'0'..'7'}
  392. binChars = {'0'..'1'}
  393. symChars = {'A'..'Z', 'a'..'z', '0'..'9', '_', '\x80'..'\xFF'}
  394. var pos = g.pos
  395. g.start = g.pos
  396. if g.state == gtStringLit:
  397. g.kind = gtStringLit
  398. while true:
  399. case g.buf[pos]
  400. of '\\':
  401. g.kind = gtEscapeSequence
  402. inc(pos)
  403. case g.buf[pos]
  404. of 'x', 'X':
  405. inc(pos)
  406. if g.buf[pos] in hexChars: inc(pos)
  407. if g.buf[pos] in hexChars: inc(pos)
  408. of '0'..'9':
  409. while g.buf[pos] in {'0'..'9'}: inc(pos)
  410. of '\0':
  411. g.state = gtNone
  412. else: inc(pos)
  413. break
  414. of '\0', '\x0D', '\x0A':
  415. g.state = gtNone
  416. break
  417. of '\"':
  418. inc(pos)
  419. g.state = gtNone
  420. break
  421. else: inc(pos)
  422. else:
  423. case g.buf[pos]
  424. of ' ', '\x09'..'\x0D':
  425. g.kind = gtWhitespace
  426. while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos)
  427. of '/':
  428. inc(pos)
  429. if g.buf[pos] == '/':
  430. g.kind = gtComment
  431. while not (g.buf[pos] in {'\0', '\x0A', '\x0D'}): inc(pos)
  432. elif g.buf[pos] == '*':
  433. g.kind = gtLongComment
  434. var nested = 0
  435. inc(pos)
  436. while true:
  437. case g.buf[pos]
  438. of '*':
  439. inc(pos)
  440. if g.buf[pos] == '/':
  441. inc(pos)
  442. if nested == 0: break
  443. of '/':
  444. inc(pos)
  445. if g.buf[pos] == '*':
  446. inc(pos)
  447. if hasNestedComments in flags: inc(nested)
  448. of '\0':
  449. break
  450. else: inc(pos)
  451. of '#':
  452. inc(pos)
  453. if hasPreprocessor in flags:
  454. g.kind = gtPreprocessor
  455. while g.buf[pos] in {' ', '\t'}: inc(pos)
  456. while g.buf[pos] in symChars: inc(pos)
  457. else:
  458. g.kind = gtOperator
  459. of 'a'..'z', 'A'..'Z', '_', '\x80'..'\xFF':
  460. var id = ""
  461. while g.buf[pos] in symChars:
  462. add(id, g.buf[pos])
  463. inc(pos)
  464. if isKeyword(keywords, id) >= 0: g.kind = gtKeyword
  465. else: g.kind = gtIdentifier
  466. of '0':
  467. inc(pos)
  468. case g.buf[pos]
  469. of 'b', 'B':
  470. inc(pos)
  471. while g.buf[pos] in binChars: inc(pos)
  472. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  473. of 'x', 'X':
  474. inc(pos)
  475. while g.buf[pos] in hexChars: inc(pos)
  476. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  477. of '0'..'7':
  478. inc(pos)
  479. while g.buf[pos] in octChars: inc(pos)
  480. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  481. else:
  482. pos = generalNumber(g, pos)
  483. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  484. of '1'..'9':
  485. pos = generalNumber(g, pos)
  486. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  487. of '\'':
  488. pos = generalStrLit(g, pos)
  489. g.kind = gtCharLit
  490. of '\"':
  491. inc(pos)
  492. g.kind = gtStringLit
  493. while true:
  494. case g.buf[pos]
  495. of '\0':
  496. break
  497. of '\"':
  498. inc(pos)
  499. break
  500. of '\\':
  501. g.state = g.kind
  502. break
  503. else: inc(pos)
  504. of '(', ')', '[', ']', '{', '}', ':', ',', ';', '.':
  505. inc(pos)
  506. g.kind = gtPunctuation
  507. of '\0':
  508. g.kind = gtEof
  509. else:
  510. if g.buf[pos] in OpChars:
  511. g.kind = gtOperator
  512. while g.buf[pos] in OpChars: inc(pos)
  513. else:
  514. inc(pos)
  515. g.kind = gtNone
  516. g.length = pos - g.pos
  517. if g.kind != gtEof and g.length <= 0:
  518. assert false, "clikeNextToken: produced an empty token"
  519. g.pos = pos
  520. proc cNextToken(g: var GeneralTokenizer) =
  521. const
  522. keywords: array[0..36, string] = ["_Bool", "_Complex", "_Imaginary", "auto",
  523. "break", "case", "char", "const", "continue", "default", "do", "double",
  524. "else", "enum", "extern", "float", "for", "goto", "if", "inline", "int",
  525. "long", "register", "restrict", "return", "short", "signed", "sizeof",
  526. "static", "struct", "switch", "typedef", "union", "unsigned", "void",
  527. "volatile", "while"]
  528. clikeNextToken(g, keywords, {hasPreprocessor})
  529. proc cppNextToken(g: var GeneralTokenizer) =
  530. const
  531. keywords: array[0..47, string] = ["asm", "auto", "break", "case", "catch",
  532. "char", "class", "const", "continue", "default", "delete", "do", "double",
  533. "else", "enum", "extern", "float", "for", "friend", "goto", "if",
  534. "inline", "int", "long", "new", "operator", "private", "protected",
  535. "public", "register", "return", "short", "signed", "sizeof", "static",
  536. "struct", "switch", "template", "this", "throw", "try", "typedef",
  537. "union", "unsigned", "virtual", "void", "volatile", "while"]
  538. clikeNextToken(g, keywords, {hasPreprocessor})
  539. proc csharpNextToken(g: var GeneralTokenizer) =
  540. const
  541. keywords: array[0..76, string] = ["abstract", "as", "base", "bool", "break",
  542. "byte", "case", "catch", "char", "checked", "class", "const", "continue",
  543. "decimal", "default", "delegate", "do", "double", "else", "enum", "event",
  544. "explicit", "extern", "false", "finally", "fixed", "float", "for",
  545. "foreach", "goto", "if", "implicit", "in", "int", "interface", "internal",
  546. "is", "lock", "long", "namespace", "new", "null", "object", "operator",
  547. "out", "override", "params", "private", "protected", "public", "readonly",
  548. "ref", "return", "sbyte", "sealed", "short", "sizeof", "stackalloc",
  549. "static", "string", "struct", "switch", "this", "throw", "true", "try",
  550. "typeof", "uint", "ulong", "unchecked", "unsafe", "ushort", "using",
  551. "virtual", "void", "volatile", "while"]
  552. clikeNextToken(g, keywords, {hasPreprocessor})
  553. proc javaNextToken(g: var GeneralTokenizer) =
  554. const
  555. keywords: array[0..52, string] = ["abstract", "assert", "boolean", "break",
  556. "byte", "case", "catch", "char", "class", "const", "continue", "default",
  557. "do", "double", "else", "enum", "extends", "false", "final", "finally",
  558. "float", "for", "goto", "if", "implements", "import", "instanceof", "int",
  559. "interface", "long", "native", "new", "null", "package", "private",
  560. "protected", "public", "return", "short", "static", "strictfp", "super",
  561. "switch", "synchronized", "this", "throw", "throws", "transient", "true",
  562. "try", "void", "volatile", "while"]
  563. clikeNextToken(g, keywords, {})
  564. proc yamlPlainStrLit(g: var GeneralTokenizer, pos: var int) =
  565. g.kind = gtStringLit
  566. while g.buf[pos] notin {'\0', '\x09'..'\x0D', ',', ']', '}'}:
  567. if g.buf[pos] == ':' and
  568. g.buf[pos + 1] in {'\0', '\x09'..'\x0D', ' '}:
  569. break
  570. inc(pos)
  571. proc yamlPossibleNumber(g: var GeneralTokenizer, pos: var int) =
  572. g.kind = gtNone
  573. if g.buf[pos] == '-': inc(pos)
  574. if g.buf[pos] == '0': inc(pos)
  575. elif g.buf[pos] in '1'..'9':
  576. inc(pos)
  577. while g.buf[pos] in {'0'..'9'}: inc(pos)
  578. else: yamlPlainStrLit(g, pos)
  579. if g.kind == gtNone:
  580. if g.buf[pos] in {'\0', '\x09'..'\x0D', ' ', ',', ']', '}'}:
  581. g.kind = gtDecNumber
  582. elif g.buf[pos] == '.':
  583. inc(pos)
  584. if g.buf[pos] notin {'0'..'9'}: yamlPlainStrLit(g, pos)
  585. else:
  586. while g.buf[pos] in {'0'..'9'}: inc(pos)
  587. if g.buf[pos] in {'\0', '\x09'..'\x0D', ' ', ',', ']', '}'}:
  588. g.kind = gtFloatNumber
  589. if g.kind == gtNone:
  590. if g.buf[pos] in {'e', 'E'}:
  591. inc(pos)
  592. if g.buf[pos] in {'-', '+'}: inc(pos)
  593. if g.buf[pos] notin {'0'..'9'}: yamlPlainStrLit(g, pos)
  594. else:
  595. while g.buf[pos] in {'0'..'9'}: inc(pos)
  596. if g.buf[pos] in {'\0', '\x09'..'\x0D', ' ', ',', ']', '}'}:
  597. g.kind = gtFloatNumber
  598. else: yamlPlainStrLit(g, pos)
  599. else: yamlPlainStrLit(g, pos)
  600. while g.buf[pos] notin {'\0', ',', ']', '}', '\x0A', '\x0D'}:
  601. inc(pos)
  602. if g.buf[pos] notin {'\x09'..'\x0D', ' ', ',', ']', '}'}:
  603. yamlPlainStrLit(g, pos)
  604. break
  605. # theoretically, we would need to parse indentation (like with block scalars)
  606. # because of possible multiline flow scalars that start with number-like
  607. # content, but that is far too troublesome. I think it is fine that the
  608. # highlighter is sloppy here.
  609. proc yamlNextToken(g: var GeneralTokenizer) =
  610. const
  611. hexChars = {'0'..'9', 'A'..'F', 'a'..'f'}
  612. var pos = g.pos
  613. g.start = g.pos
  614. if g.state == gtStringLit:
  615. g.kind = gtStringLit
  616. while true:
  617. case g.buf[pos]
  618. of '\\':
  619. if pos != g.pos: break
  620. g.kind = gtEscapeSequence
  621. inc(pos)
  622. case g.buf[pos]
  623. of 'x':
  624. inc(pos)
  625. for i in 1..2:
  626. {.unroll.}
  627. if g.buf[pos] in hexChars: inc(pos)
  628. break
  629. of 'u':
  630. inc(pos)
  631. for i in 1..4:
  632. {.unroll.}
  633. if g.buf[pos] in hexChars: inc(pos)
  634. break
  635. of 'U':
  636. inc(pos)
  637. for i in 1..8:
  638. {.unroll.}
  639. if g.buf[pos] in hexChars: inc(pos)
  640. break
  641. else: inc(pos)
  642. break
  643. of '\0':
  644. g.state = gtOther
  645. break
  646. of '\"':
  647. inc(pos)
  648. g.state = gtOther
  649. break
  650. else: inc(pos)
  651. elif g.state == gtCharLit:
  652. # abusing gtCharLit as single-quoted string lit
  653. g.kind = gtStringLit
  654. inc(pos) # skip the starting '
  655. while true:
  656. case g.buf[pos]
  657. of '\'':
  658. inc(pos)
  659. if g.buf[pos] == '\'':
  660. inc(pos)
  661. g.kind = gtEscapeSequence
  662. else: g.state = gtOther
  663. break
  664. else: inc(pos)
  665. elif g.state == gtCommand:
  666. # gtCommand means 'block scalar header'
  667. case g.buf[pos]
  668. of ' ', '\t':
  669. g.kind = gtWhitespace
  670. while g.buf[pos] in {' ', '\t'}: inc(pos)
  671. of '#':
  672. g.kind = gtComment
  673. while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos)
  674. of '\x0A', '\x0D': discard
  675. else:
  676. # illegal here. just don't parse a block scalar
  677. g.kind = gtNone
  678. g.state = gtOther
  679. if g.buf[pos] in {'\x0A', '\x0D'} and g.state == gtCommand:
  680. g.state = gtLongStringLit
  681. elif g.state == gtLongStringLit:
  682. # beware, this is the only token where we actually have to parse
  683. # indentation.
  684. g.kind = gtLongStringLit
  685. # first, we have to find the parent indentation of the block scalar, so that
  686. # we know when to stop
  687. assert g.buf[pos] in {'\x0A', '\x0D'}
  688. var lookbehind = pos - 1
  689. var headerStart = -1
  690. while lookbehind >= 0 and g.buf[lookbehind] notin {'\x0A', '\x0D'}:
  691. if headerStart == -1 and g.buf[lookbehind] in {'|', '>'}:
  692. headerStart = lookbehind
  693. dec(lookbehind)
  694. assert headerStart != -1
  695. var indentation = 1
  696. while g.buf[lookbehind + indentation] == ' ': inc(indentation)
  697. if g.buf[lookbehind + indentation] in {'|', '>'}:
  698. # when the header is alone in a line, this line does not show the parent's
  699. # indentation, so we must go further. search the first previous line with
  700. # non-whitespace content.
  701. while lookbehind >= 0 and g.buf[lookbehind] in {'\x0A', '\x0D'}:
  702. dec(lookbehind)
  703. while lookbehind >= 0 and
  704. g.buf[lookbehind] in {' ', '\t'}: dec(lookbehind)
  705. # now, find the beginning of the line...
  706. while lookbehind >= 0 and g.buf[lookbehind] notin {'\x0A', '\x0D'}:
  707. dec(lookbehind)
  708. # ... and its indentation
  709. indentation = 1
  710. while g.buf[lookbehind + indentation] == ' ': inc(indentation)
  711. if lookbehind == -1: indentation = 0 # top level
  712. elif g.buf[lookbehind + 1] == '-' and g.buf[lookbehind + 2] == '-' and
  713. g.buf[lookbehind + 3] == '-' and
  714. g.buf[lookbehind + 4] in {'\x09'..'\x0D', ' '}:
  715. # this is a document start, therefore, we are at top level
  716. indentation = 0
  717. # because lookbehind was at newline char when calculating indentation, we're
  718. # off by one. fix that. top level's parent will have indentation of -1.
  719. let parentIndentation = indentation - 1
  720. # find first content
  721. while g.buf[pos] in {' ', '\x0A', '\x0D'}:
  722. if g.buf[pos] == ' ': inc(indentation)
  723. else: indentation = 0
  724. inc(pos)
  725. var minIndentation = indentation
  726. # for stupid edge cases, we must check whether an explicit indentation depth
  727. # is given at the header.
  728. while g.buf[headerStart] in {'>', '|', '+', '-'}: inc(headerStart)
  729. if g.buf[headerStart] in {'0'..'9'}:
  730. minIndentation = min(minIndentation, ord(g.buf[headerStart]) - ord('0'))
  731. # process content lines
  732. while indentation > parentIndentation and g.buf[pos] != '\0':
  733. if (indentation < minIndentation and g.buf[pos] == '#') or
  734. (indentation == 0 and g.buf[pos] == '.' and g.buf[pos + 1] == '.' and
  735. g.buf[pos + 2] == '.' and
  736. g.buf[pos + 3] in {'\0', '\x09'..'\x0D', ' '}):
  737. # comment after end of block scalar, or end of document
  738. break
  739. minIndentation = min(indentation, minIndentation)
  740. while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos)
  741. while g.buf[pos] in {' ', '\x0A', '\x0D'}:
  742. if g.buf[pos] == ' ': inc(indentation)
  743. else: indentation = 0
  744. inc(pos)
  745. g.state = gtOther
  746. elif g.state == gtOther:
  747. # gtOther means 'inside YAML document'
  748. case g.buf[pos]
  749. of ' ', '\x09'..'\x0D':
  750. g.kind = gtWhitespace
  751. while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos)
  752. of '#':
  753. g.kind = gtComment
  754. inc(pos)
  755. while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos)
  756. of '-':
  757. inc(pos)
  758. if g.buf[pos] in {'\0', ' ', '\x09'..'\x0D'}:
  759. g.kind = gtPunctuation
  760. elif g.buf[pos] == '-' and
  761. (pos == 1 or g.buf[pos - 2] in {'\x0A', '\x0D'}): # start of line
  762. inc(pos)
  763. if g.buf[pos] == '-' and g.buf[pos + 1] in {'\0', '\x09'..'\x0D', ' '}:
  764. inc(pos)
  765. g.kind = gtKeyword
  766. else: yamlPossibleNumber(g, pos)
  767. else: yamlPossibleNumber(g, pos)
  768. of '.':
  769. if pos == 0 or g.buf[pos - 1] in {'\x0A', '\x0D'}:
  770. inc(pos)
  771. for i in 1..2:
  772. {.unroll.}
  773. if g.buf[pos] != '.': break
  774. inc(pos)
  775. if pos == g.start + 3:
  776. g.kind = gtKeyword
  777. g.state = gtNone
  778. else: yamlPlainStrLit(g, pos)
  779. else: yamlPlainStrLit(g, pos)
  780. of '?':
  781. inc(pos)
  782. if g.buf[pos] in {'\0', ' ', '\x09'..'\x0D'}:
  783. g.kind = gtPunctuation
  784. else: yamlPlainStrLit(g, pos)
  785. of ':':
  786. inc(pos)
  787. if g.buf[pos] in {'\0', '\x09'..'\x0D', ' ', '\'', '\"'} or
  788. (pos > 0 and g.buf[pos - 2] in {'}', ']', '\"', '\''}):
  789. g.kind = gtPunctuation
  790. else: yamlPlainStrLit(g, pos)
  791. of '[', ']', '{', '}', ',':
  792. inc(pos)
  793. g.kind = gtPunctuation
  794. of '\"':
  795. inc(pos)
  796. g.state = gtStringLit
  797. g.kind = gtStringLit
  798. of '\'':
  799. g.state = gtCharLit
  800. g.kind = gtNone
  801. of '!':
  802. g.kind = gtTagStart
  803. inc(pos)
  804. if g.buf[pos] == '<':
  805. # literal tag (e.g. `!<tag:yaml.org,2002:str>`)
  806. while g.buf[pos] notin {'\0', '>', '\x09'..'\x0D', ' '}: inc(pos)
  807. if g.buf[pos] == '>': inc(pos)
  808. else:
  809. while g.buf[pos] in {'A'..'Z', 'a'..'z', '0'..'9', '-'}: inc(pos)
  810. case g.buf[pos]
  811. of '!':
  812. # prefixed tag (e.g. `!!str`)
  813. inc(pos)
  814. while g.buf[pos] notin
  815. {'\0', '\x09'..'\x0D', ' ', ',', '[', ']', '{', '}'}: inc(pos)
  816. of '\0', '\x09'..'\x0D', ' ': discard
  817. else:
  818. # local tag (e.g. `!nim:system:int`)
  819. while g.buf[pos] notin {'\0', '\x09'..'\x0D', ' '}: inc(pos)
  820. of '&':
  821. g.kind = gtLabel
  822. while g.buf[pos] notin {'\0', '\x09'..'\x0D', ' '}: inc(pos)
  823. of '*':
  824. g.kind = gtReference
  825. while g.buf[pos] notin {'\0', '\x09'..'\x0D', ' '}: inc(pos)
  826. of '|', '>':
  827. # this can lead to incorrect tokenization when | or > appear inside flow
  828. # content. checking whether we're inside flow content is not
  829. # chomsky type-3, so we won't do that here.
  830. g.kind = gtCommand
  831. g.state = gtCommand
  832. inc(pos)
  833. while g.buf[pos] in {'0'..'9', '+', '-'}: inc(pos)
  834. of '0'..'9': yamlPossibleNumber(g, pos)
  835. of '\0': g.kind = gtEof
  836. else: yamlPlainStrLit(g, pos)
  837. else:
  838. # outside document
  839. case g.buf[pos]
  840. of '%':
  841. if pos == 0 or g.buf[pos - 1] in {'\x0A', '\x0D'}:
  842. g.kind = gtDirective
  843. while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos)
  844. else:
  845. g.state = gtOther
  846. yamlPlainStrLit(g, pos)
  847. of ' ', '\x09'..'\x0D':
  848. g.kind = gtWhitespace
  849. while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos)
  850. of '#':
  851. g.kind = gtComment
  852. while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos)
  853. of '\0': g.kind = gtEof
  854. else:
  855. g.kind = gtNone
  856. g.state = gtOther
  857. g.length = pos - g.pos
  858. g.pos = pos
  859. proc getNextToken*(g: var GeneralTokenizer, lang: SourceLanguage) =
  860. case lang
  861. of langNone: assert false
  862. of langNim: nimNextToken(g)
  863. of langCpp: cppNextToken(g)
  864. of langCsharp: csharpNextToken(g)
  865. of langC: cNextToken(g)
  866. of langJava: javaNextToken(g)
  867. of langYaml: yamlNextToken(g)
  868. when isMainModule:
  869. var keywords: seq[string]
  870. # Try to work running in both the subdir or at the root.
  871. for filename in ["doc/keywords.txt", "../../../doc/keywords.txt"]:
  872. try:
  873. let input = string(readFile(filename))
  874. keywords = input.splitWhitespace()
  875. break
  876. except:
  877. echo filename, " not found"
  878. doAssert(keywords.len > 0, "Couldn't read any keywords.txt file!")
  879. for i in 0..min(keywords.len, nimKeywords.len)-1:
  880. doAssert keywords[i] == nimKeywords[i], "Unexpected keyword"
  881. doAssert keywords.len == nimKeywords.len, "No matching lengths"