#
#
#            Nim's Runtime Library
#        (c) Copyright 2012 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## Source highlighter for programming or markup languages.
## Currently only a few languages are supported; other languages may be added.
## The interface supports one language nested in another.
import
  strutils
from algorithm import binarySearch
  15. type
  16. TokenClass* = enum
  17. gtEof, gtNone, gtWhitespace, gtDecNumber, gtBinNumber, gtHexNumber,
  18. gtOctNumber, gtFloatNumber, gtIdentifier, gtKeyword, gtStringLit,
  19. gtLongStringLit, gtCharLit, gtEscapeSequence, # escape sequence like \xff
  20. gtOperator, gtPunctuation, gtComment, gtLongComment, gtRegularExpression,
  21. gtTagStart, gtTagEnd, gtKey, gtValue, gtRawData, gtAssembler,
  22. gtPreprocessor, gtDirective, gtCommand, gtRule, gtHyperlink, gtLabel,
  23. gtReference, gtOther
  24. GeneralTokenizer* = object of RootObj
  25. kind*: TokenClass
  26. start*, length*: int
  27. buf: cstring
  28. pos: int
  29. state: TokenClass
  30. SourceLanguage* = enum
  31. langNone, langNim, langCpp, langCsharp, langC, langJava,
  32. langYaml
  33. const
  34. sourceLanguageToStr*: array[SourceLanguage, string] = ["none",
  35. "Nim", "C++", "C#", "C", "Java", "Yaml"]
  36. tokenClassToStr*: array[TokenClass, string] = ["Eof", "None", "Whitespace",
  37. "DecNumber", "BinNumber", "HexNumber", "OctNumber", "FloatNumber",
  38. "Identifier", "Keyword", "StringLit", "LongStringLit", "CharLit",
  39. "EscapeSequence", "Operator", "Punctuation", "Comment", "LongComment",
  40. "RegularExpression", "TagStart", "TagEnd", "Key", "Value", "RawData",
  41. "Assembler", "Preprocessor", "Directive", "Command", "Rule", "Hyperlink",
  42. "Label", "Reference", "Other"]
  43. # The following list comes from doc/keywords.txt, make sure it is
  44. # synchronized with this array by running the module itself as a test case.
  45. nimKeywords = ["addr", "and", "as", "asm", "bind", "block",
  46. "break", "case", "cast", "concept", "const", "continue", "converter",
  47. "defer", "discard", "distinct", "div", "do",
  48. "elif", "else", "end", "enum", "except", "export",
  49. "finally", "for", "from", "func",
  50. "if", "import", "in", "include",
  51. "interface", "is", "isnot", "iterator", "let", "macro", "method",
  52. "mixin", "mod", "nil", "not", "notin", "object", "of", "or", "out", "proc",
  53. "ptr", "raise", "ref", "return", "shl", "shr", "static",
  54. "template", "try", "tuple", "type", "using", "var", "when", "while",
  55. "xor", "yield"]
  56. proc getSourceLanguage*(name: string): SourceLanguage =
  57. for i in countup(succ(low(SourceLanguage)), high(SourceLanguage)):
  58. if cmpIgnoreStyle(name, sourceLanguageToStr[i]) == 0:
  59. return i
  60. result = langNone
  61. proc initGeneralTokenizer*(g: var GeneralTokenizer, buf: cstring) =
  62. g.buf = buf
  63. g.kind = low(TokenClass)
  64. g.start = 0
  65. g.length = 0
  66. g.state = low(TokenClass)
  67. var pos = 0 # skip initial whitespace:
  68. while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos)
  69. g.pos = pos
  70. proc initGeneralTokenizer*(g: var GeneralTokenizer, buf: string) =
  71. initGeneralTokenizer(g, cstring(buf))
  72. proc deinitGeneralTokenizer*(g: var GeneralTokenizer) =
  73. discard
  74. proc nimGetKeyword(id: string): TokenClass =
  75. for k in nimKeywords:
  76. if cmpIgnoreStyle(id, k) == 0: return gtKeyword
  77. result = gtIdentifier
  78. when false:
  79. var i = getIdent(id)
  80. if (i.id >= ord(tokKeywordLow) - ord(tkSymbol)) and
  81. (i.id <= ord(tokKeywordHigh) - ord(tkSymbol)):
  82. result = gtKeyword
  83. else:
  84. result = gtIdentifier
  85. proc nimNumberPostfix(g: var GeneralTokenizer, position: int): int =
  86. var pos = position
  87. if g.buf[pos] == '\'':
  88. inc(pos)
  89. case g.buf[pos]
  90. of 'f', 'F':
  91. g.kind = gtFloatNumber
  92. inc(pos)
  93. if g.buf[pos] in {'0'..'9'}: inc(pos)
  94. if g.buf[pos] in {'0'..'9'}: inc(pos)
  95. of 'i', 'I':
  96. inc(pos)
  97. if g.buf[pos] in {'0'..'9'}: inc(pos)
  98. if g.buf[pos] in {'0'..'9'}: inc(pos)
  99. else:
  100. discard
  101. result = pos
  102. proc nimNumber(g: var GeneralTokenizer, position: int): int =
  103. const decChars = {'0'..'9', '_'}
  104. var pos = position
  105. g.kind = gtDecNumber
  106. while g.buf[pos] in decChars: inc(pos)
  107. if g.buf[pos] == '.':
  108. g.kind = gtFloatNumber
  109. inc(pos)
  110. while g.buf[pos] in decChars: inc(pos)
  111. if g.buf[pos] in {'e', 'E'}:
  112. g.kind = gtFloatNumber
  113. inc(pos)
  114. if g.buf[pos] in {'+', '-'}: inc(pos)
  115. while g.buf[pos] in decChars: inc(pos)
  116. result = nimNumberPostfix(g, pos)
  117. const
  118. OpChars = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.',
  119. '|', '=', '%', '&', '$', '@', '~', ':'}
  120. proc nimNextToken(g: var GeneralTokenizer) =
  121. const
  122. hexChars = {'0'..'9', 'A'..'F', 'a'..'f', '_'}
  123. octChars = {'0'..'7', '_'}
  124. binChars = {'0'..'1', '_'}
  125. SymChars = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'}
  126. var pos = g.pos
  127. g.start = g.pos
  128. if g.state == gtStringLit:
  129. g.kind = gtStringLit
  130. while true:
  131. case g.buf[pos]
  132. of '\\':
  133. g.kind = gtEscapeSequence
  134. inc(pos)
  135. case g.buf[pos]
  136. of 'x', 'X':
  137. inc(pos)
  138. if g.buf[pos] in hexChars: inc(pos)
  139. if g.buf[pos] in hexChars: inc(pos)
  140. of '0'..'9':
  141. while g.buf[pos] in {'0'..'9'}: inc(pos)
  142. of '\0':
  143. g.state = gtNone
  144. else: inc(pos)
  145. break
  146. of '\0', '\x0D', '\x0A':
  147. g.state = gtNone
  148. break
  149. of '\"':
  150. inc(pos)
  151. g.state = gtNone
  152. break
  153. else: inc(pos)
  154. else:
  155. case g.buf[pos]
  156. of ' ', '\x09'..'\x0D':
  157. g.kind = gtWhitespace
  158. while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos)
  159. of '#':
  160. g.kind = gtComment
  161. inc(pos)
  162. var isDoc = false
  163. if g.buf[pos] == '#':
  164. inc(pos)
  165. isDoc = true
  166. if g.buf[pos] == '[':
  167. g.kind = gtLongComment
  168. var nesting = 0
  169. while true:
  170. case g.buf[pos]
  171. of '\0': break
  172. of '#':
  173. if isDoc:
  174. if g.buf[pos+1] == '#' and g.buf[pos+2] == '[':
  175. inc nesting
  176. elif g.buf[pos+1] == '[':
  177. inc nesting
  178. inc pos
  179. of ']':
  180. if isDoc:
  181. if g.buf[pos+1] == '#' and g.buf[pos+2] == '#':
  182. if nesting == 0:
  183. inc(pos, 3)
  184. break
  185. dec nesting
  186. elif g.buf[pos+1] == '#':
  187. if nesting == 0:
  188. inc(pos, 2)
  189. break
  190. dec nesting
  191. inc pos
  192. else:
  193. inc pos
  194. else:
  195. while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos)
  196. of 'a'..'z', 'A'..'Z', '_', '\x80'..'\xFF':
  197. var id = ""
  198. while g.buf[pos] in SymChars + {'_'}:
  199. add(id, g.buf[pos])
  200. inc(pos)
  201. if (g.buf[pos] == '\"'):
  202. if (g.buf[pos + 1] == '\"') and (g.buf[pos + 2] == '\"'):
  203. inc(pos, 3)
  204. g.kind = gtLongStringLit
  205. while true:
  206. case g.buf[pos]
  207. of '\0':
  208. break
  209. of '\"':
  210. inc(pos)
  211. if g.buf[pos] == '\"' and g.buf[pos+1] == '\"' and
  212. g.buf[pos+2] != '\"':
  213. inc(pos, 2)
  214. break
  215. else: inc(pos)
  216. else:
  217. g.kind = gtRawData
  218. inc(pos)
  219. while not (g.buf[pos] in {'\0', '\x0A', '\x0D'}):
  220. if g.buf[pos] == '"' and g.buf[pos+1] != '"': break
  221. inc(pos)
  222. if g.buf[pos] == '\"': inc(pos)
  223. else:
  224. g.kind = nimGetKeyword(id)
  225. of '0':
  226. inc(pos)
  227. case g.buf[pos]
  228. of 'b', 'B':
  229. g.kind = gtBinNumber
  230. inc(pos)
  231. while g.buf[pos] in binChars: inc(pos)
  232. pos = nimNumberPostfix(g, pos)
  233. of 'x', 'X':
  234. g.kind = gtHexNumber
  235. inc(pos)
  236. while g.buf[pos] in hexChars: inc(pos)
  237. pos = nimNumberPostfix(g, pos)
  238. of 'o', 'O':
  239. g.kind = gtOctNumber
  240. inc(pos)
  241. while g.buf[pos] in octChars: inc(pos)
  242. pos = nimNumberPostfix(g, pos)
  243. else: pos = nimNumber(g, pos)
  244. of '1'..'9':
  245. pos = nimNumber(g, pos)
  246. of '\'':
  247. inc(pos)
  248. g.kind = gtCharLit
  249. while true:
  250. case g.buf[pos]
  251. of '\0', '\x0D', '\x0A':
  252. break
  253. of '\'':
  254. inc(pos)
  255. break
  256. of '\\':
  257. inc(pos, 2)
  258. else: inc(pos)
  259. of '\"':
  260. inc(pos)
  261. if (g.buf[pos] == '\"') and (g.buf[pos + 1] == '\"'):
  262. inc(pos, 2)
  263. g.kind = gtLongStringLit
  264. while true:
  265. case g.buf[pos]
  266. of '\0':
  267. break
  268. of '\"':
  269. inc(pos)
  270. if g.buf[pos] == '\"' and g.buf[pos+1] == '\"' and
  271. g.buf[pos+2] != '\"':
  272. inc(pos, 2)
  273. break
  274. else: inc(pos)
  275. else:
  276. g.kind = gtStringLit
  277. while true:
  278. case g.buf[pos]
  279. of '\0', '\x0D', '\x0A':
  280. break
  281. of '\"':
  282. inc(pos)
  283. break
  284. of '\\':
  285. g.state = g.kind
  286. break
  287. else: inc(pos)
  288. of '(', ')', '[', ']', '{', '}', '`', ':', ',', ';':
  289. inc(pos)
  290. g.kind = gtPunctuation
  291. of '\0':
  292. g.kind = gtEof
  293. else:
  294. if g.buf[pos] in OpChars:
  295. g.kind = gtOperator
  296. while g.buf[pos] in OpChars: inc(pos)
  297. else:
  298. inc(pos)
  299. g.kind = gtNone
  300. g.length = pos - g.pos
  301. if g.kind != gtEof and g.length <= 0:
  302. assert false, "nimNextToken: produced an empty token"
  303. g.pos = pos
  304. proc generalNumber(g: var GeneralTokenizer, position: int): int =
  305. const decChars = {'0'..'9'}
  306. var pos = position
  307. g.kind = gtDecNumber
  308. while g.buf[pos] in decChars: inc(pos)
  309. if g.buf[pos] == '.':
  310. g.kind = gtFloatNumber
  311. inc(pos)
  312. while g.buf[pos] in decChars: inc(pos)
  313. if g.buf[pos] in {'e', 'E'}:
  314. g.kind = gtFloatNumber
  315. inc(pos)
  316. if g.buf[pos] in {'+', '-'}: inc(pos)
  317. while g.buf[pos] in decChars: inc(pos)
  318. result = pos
  319. proc generalStrLit(g: var GeneralTokenizer, position: int): int =
  320. const
  321. decChars = {'0'..'9'}
  322. hexChars = {'0'..'9', 'A'..'F', 'a'..'f'}
  323. var pos = position
  324. g.kind = gtStringLit
  325. var c = g.buf[pos]
  326. inc(pos) # skip " or '
  327. while true:
  328. case g.buf[pos]
  329. of '\0':
  330. break
  331. of '\\':
  332. inc(pos)
  333. case g.buf[pos]
  334. of '\0':
  335. break
  336. of '0'..'9':
  337. while g.buf[pos] in decChars: inc(pos)
  338. of 'x', 'X':
  339. inc(pos)
  340. if g.buf[pos] in hexChars: inc(pos)
  341. if g.buf[pos] in hexChars: inc(pos)
  342. else: inc(pos, 2)
  343. else:
  344. if g.buf[pos] == c:
  345. inc(pos)
  346. break
  347. else:
  348. inc(pos)
  349. result = pos
  350. proc isKeyword(x: openArray[string], y: string): int =
  351. binarySearch(x, y)
  352. proc isKeywordIgnoreCase(x: openArray[string], y: string): int =
  353. binarySearch(x, y, cmpIgnoreCase)
  354. type
  355. TokenizerFlag = enum
  356. hasPreprocessor, hasNestedComments
  357. TokenizerFlags = set[TokenizerFlag]
  358. proc clikeNextToken(g: var GeneralTokenizer, keywords: openArray[string],
  359. flags: TokenizerFlags) =
  360. const
  361. hexChars = {'0'..'9', 'A'..'F', 'a'..'f'}
  362. octChars = {'0'..'7'}
  363. binChars = {'0'..'1'}
  364. symChars = {'A'..'Z', 'a'..'z', '0'..'9', '_', '\x80'..'\xFF'}
  365. var pos = g.pos
  366. g.start = g.pos
  367. if g.state == gtStringLit:
  368. g.kind = gtStringLit
  369. while true:
  370. case g.buf[pos]
  371. of '\\':
  372. g.kind = gtEscapeSequence
  373. inc(pos)
  374. case g.buf[pos]
  375. of 'x', 'X':
  376. inc(pos)
  377. if g.buf[pos] in hexChars: inc(pos)
  378. if g.buf[pos] in hexChars: inc(pos)
  379. of '0'..'9':
  380. while g.buf[pos] in {'0'..'9'}: inc(pos)
  381. of '\0':
  382. g.state = gtNone
  383. else: inc(pos)
  384. break
  385. of '\0', '\x0D', '\x0A':
  386. g.state = gtNone
  387. break
  388. of '\"':
  389. inc(pos)
  390. g.state = gtNone
  391. break
  392. else: inc(pos)
  393. else:
  394. case g.buf[pos]
  395. of ' ', '\x09'..'\x0D':
  396. g.kind = gtWhitespace
  397. while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos)
  398. of '/':
  399. inc(pos)
  400. if g.buf[pos] == '/':
  401. g.kind = gtComment
  402. while not (g.buf[pos] in {'\0', '\x0A', '\x0D'}): inc(pos)
  403. elif g.buf[pos] == '*':
  404. g.kind = gtLongComment
  405. var nested = 0
  406. inc(pos)
  407. while true:
  408. case g.buf[pos]
  409. of '*':
  410. inc(pos)
  411. if g.buf[pos] == '/':
  412. inc(pos)
  413. if nested == 0: break
  414. of '/':
  415. inc(pos)
  416. if g.buf[pos] == '*':
  417. inc(pos)
  418. if hasNestedComments in flags: inc(nested)
  419. of '\0':
  420. break
  421. else: inc(pos)
  422. of '#':
  423. inc(pos)
  424. if hasPreprocessor in flags:
  425. g.kind = gtPreprocessor
  426. while g.buf[pos] in {' ', '\t'}: inc(pos)
  427. while g.buf[pos] in symChars: inc(pos)
  428. else:
  429. g.kind = gtOperator
  430. of 'a'..'z', 'A'..'Z', '_', '\x80'..'\xFF':
  431. var id = ""
  432. while g.buf[pos] in symChars:
  433. add(id, g.buf[pos])
  434. inc(pos)
  435. if isKeyword(keywords, id) >= 0: g.kind = gtKeyword
  436. else: g.kind = gtIdentifier
  437. of '0':
  438. inc(pos)
  439. case g.buf[pos]
  440. of 'b', 'B':
  441. inc(pos)
  442. while g.buf[pos] in binChars: inc(pos)
  443. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  444. of 'x', 'X':
  445. inc(pos)
  446. while g.buf[pos] in hexChars: inc(pos)
  447. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  448. of '0'..'7':
  449. inc(pos)
  450. while g.buf[pos] in octChars: inc(pos)
  451. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  452. else:
  453. pos = generalNumber(g, pos)
  454. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  455. of '1'..'9':
  456. pos = generalNumber(g, pos)
  457. if g.buf[pos] in {'A'..'Z', 'a'..'z'}: inc(pos)
  458. of '\'':
  459. pos = generalStrLit(g, pos)
  460. g.kind = gtCharLit
  461. of '\"':
  462. inc(pos)
  463. g.kind = gtStringLit
  464. while true:
  465. case g.buf[pos]
  466. of '\0':
  467. break
  468. of '\"':
  469. inc(pos)
  470. break
  471. of '\\':
  472. g.state = g.kind
  473. break
  474. else: inc(pos)
  475. of '(', ')', '[', ']', '{', '}', ':', ',', ';', '.':
  476. inc(pos)
  477. g.kind = gtPunctuation
  478. of '\0':
  479. g.kind = gtEof
  480. else:
  481. if g.buf[pos] in OpChars:
  482. g.kind = gtOperator
  483. while g.buf[pos] in OpChars: inc(pos)
  484. else:
  485. inc(pos)
  486. g.kind = gtNone
  487. g.length = pos - g.pos
  488. if g.kind != gtEof and g.length <= 0:
  489. assert false, "clikeNextToken: produced an empty token"
  490. g.pos = pos
  491. proc cNextToken(g: var GeneralTokenizer) =
  492. const
  493. keywords: array[0..36, string] = ["_Bool", "_Complex", "_Imaginary", "auto",
  494. "break", "case", "char", "const", "continue", "default", "do", "double",
  495. "else", "enum", "extern", "float", "for", "goto", "if", "inline", "int",
  496. "long", "register", "restrict", "return", "short", "signed", "sizeof",
  497. "static", "struct", "switch", "typedef", "union", "unsigned", "void",
  498. "volatile", "while"]
  499. clikeNextToken(g, keywords, {hasPreprocessor})
  500. proc cppNextToken(g: var GeneralTokenizer) =
  501. const
  502. keywords: array[0..47, string] = ["asm", "auto", "break", "case", "catch",
  503. "char", "class", "const", "continue", "default", "delete", "do", "double",
  504. "else", "enum", "extern", "float", "for", "friend", "goto", "if",
  505. "inline", "int", "long", "new", "operator", "private", "protected",
  506. "public", "register", "return", "short", "signed", "sizeof", "static",
  507. "struct", "switch", "template", "this", "throw", "try", "typedef",
  508. "union", "unsigned", "virtual", "void", "volatile", "while"]
  509. clikeNextToken(g, keywords, {hasPreprocessor})
  510. proc csharpNextToken(g: var GeneralTokenizer) =
  511. const
  512. keywords: array[0..76, string] = ["abstract", "as", "base", "bool", "break",
  513. "byte", "case", "catch", "char", "checked", "class", "const", "continue",
  514. "decimal", "default", "delegate", "do", "double", "else", "enum", "event",
  515. "explicit", "extern", "false", "finally", "fixed", "float", "for",
  516. "foreach", "goto", "if", "implicit", "in", "int", "interface", "internal",
  517. "is", "lock", "long", "namespace", "new", "null", "object", "operator",
  518. "out", "override", "params", "private", "protected", "public", "readonly",
  519. "ref", "return", "sbyte", "sealed", "short", "sizeof", "stackalloc",
  520. "static", "string", "struct", "switch", "this", "throw", "true", "try",
  521. "typeof", "uint", "ulong", "unchecked", "unsafe", "ushort", "using",
  522. "virtual", "void", "volatile", "while"]
  523. clikeNextToken(g, keywords, {hasPreprocessor})
  524. proc javaNextToken(g: var GeneralTokenizer) =
  525. const
  526. keywords: array[0..52, string] = ["abstract", "assert", "boolean", "break",
  527. "byte", "case", "catch", "char", "class", "const", "continue", "default",
  528. "do", "double", "else", "enum", "extends", "false", "final", "finally",
  529. "float", "for", "goto", "if", "implements", "import", "instanceof", "int",
  530. "interface", "long", "native", "new", "null", "package", "private",
  531. "protected", "public", "return", "short", "static", "strictfp", "super",
  532. "switch", "synchronized", "this", "throw", "throws", "transient", "true",
  533. "try", "void", "volatile", "while"]
  534. clikeNextToken(g, keywords, {})
  535. proc yamlPlainStrLit(g: var GeneralTokenizer, pos: var int) =
  536. g.kind = gtStringLit
  537. while g.buf[pos] notin {'\0', '\x09'..'\x0D', ',', ']', '}'}:
  538. if g.buf[pos] == ':' and
  539. g.buf[pos + 1] in {'\0', '\x09'..'\x0D', ' '}:
  540. break
  541. inc(pos)
  542. proc yamlPossibleNumber(g: var GeneralTokenizer, pos: var int) =
  543. g.kind = gtNone
  544. if g.buf[pos] == '-': inc(pos)
  545. if g.buf[pos] == '0': inc(pos)
  546. elif g.buf[pos] in '1'..'9':
  547. inc(pos)
  548. while g.buf[pos] in {'0'..'9'}: inc(pos)
  549. else: yamlPlainStrLit(g, pos)
  550. if g.kind == gtNone:
  551. if g.buf[pos] in {'\0', '\x09'..'\x0D', ' ', ',', ']', '}'}:
  552. g.kind = gtDecNumber
  553. elif g.buf[pos] == '.':
  554. inc(pos)
  555. if g.buf[pos] notin {'0'..'9'}: yamlPlainStrLit(g, pos)
  556. else:
  557. while g.buf[pos] in {'0'..'9'}: inc(pos)
  558. if g.buf[pos] in {'\0', '\x09'..'\x0D', ' ', ',', ']', '}'}:
  559. g.kind = gtFloatNumber
  560. if g.kind == gtNone:
  561. if g.buf[pos] in {'e', 'E'}:
  562. inc(pos)
  563. if g.buf[pos] in {'-', '+'}: inc(pos)
  564. if g.buf[pos] notin {'0'..'9'}: yamlPlainStrLit(g, pos)
  565. else:
  566. while g.buf[pos] in {'0'..'9'}: inc(pos)
  567. if g.buf[pos] in {'\0', '\x09'..'\x0D', ' ', ',', ']', '}'}:
  568. g.kind = gtFloatNumber
  569. else: yamlPlainStrLit(g, pos)
  570. else: yamlPlainStrLit(g, pos)
  571. while g.buf[pos] notin {'\0', ',', ']', '}', '\x0A', '\x0D'}:
  572. inc(pos)
  573. if g.buf[pos] notin {'\x09'..'\x0D', ' ', ',', ']', '}'}:
  574. yamlPlainStrLit(g, pos)
  575. break
  576. # theoretically, we would need to parse indentation (like with block scalars)
  577. # because of possible multiline flow scalars that start with number-like
  578. # content, but that is far too troublesome. I think it is fine that the
  579. # highlighter is sloppy here.
  580. proc yamlNextToken(g: var GeneralTokenizer) =
  581. const
  582. hexChars = {'0'..'9', 'A'..'F', 'a'..'f'}
  583. var pos = g.pos
  584. g.start = g.pos
  585. if g.state == gtStringLit:
  586. g.kind = gtStringLit
  587. while true:
  588. case g.buf[pos]
  589. of '\\':
  590. if pos != g.pos: break
  591. g.kind = gtEscapeSequence
  592. inc(pos)
  593. case g.buf[pos]
  594. of 'x':
  595. inc(pos)
  596. for i in 1..2:
  597. {.unroll.}
  598. if g.buf[pos] in hexChars: inc(pos)
  599. break
  600. of 'u':
  601. inc(pos)
  602. for i in 1..4:
  603. {.unroll.}
  604. if g.buf[pos] in hexChars: inc(pos)
  605. break
  606. of 'U':
  607. inc(pos)
  608. for i in 1..8:
  609. {.unroll.}
  610. if g.buf[pos] in hexChars: inc(pos)
  611. break
  612. else: inc(pos)
  613. break
  614. of '\0':
  615. g.state = gtOther
  616. break
  617. of '\"':
  618. inc(pos)
  619. g.state = gtOther
  620. break
  621. else: inc(pos)
  622. elif g.state == gtCharLit:
  623. # abusing gtCharLit as single-quoted string lit
  624. g.kind = gtStringLit
  625. inc(pos) # skip the starting '
  626. while true:
  627. case g.buf[pos]
  628. of '\'':
  629. inc(pos)
  630. if g.buf[pos] == '\'':
  631. inc(pos)
  632. g.kind = gtEscapeSequence
  633. else: g.state = gtOther
  634. break
  635. else: inc(pos)
  636. elif g.state == gtCommand:
  637. # gtCommand means 'block scalar header'
  638. case g.buf[pos]
  639. of ' ', '\t':
  640. g.kind = gtWhitespace
  641. while g.buf[pos] in {' ', '\t'}: inc(pos)
  642. of '#':
  643. g.kind = gtComment
  644. while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos)
  645. of '\x0A', '\x0D': discard
  646. else:
  647. # illegal here. just don't parse a block scalar
  648. g.kind = gtNone
  649. g.state = gtOther
  650. if g.buf[pos] in {'\x0A', '\x0D'} and g.state == gtCommand:
  651. g.state = gtLongStringLit
  652. elif g.state == gtLongStringLit:
  653. # beware, this is the only token where we actually have to parse
  654. # indentation.
  655. g.kind = gtLongStringLit
  656. # first, we have to find the parent indentation of the block scalar, so that
  657. # we know when to stop
  658. assert g.buf[pos] in {'\x0A', '\x0D'}
  659. var lookbehind = pos - 1
  660. var headerStart = -1
  661. while lookbehind >= 0 and g.buf[lookbehind] notin {'\x0A', '\x0D'}:
  662. if headerStart == -1 and g.buf[lookbehind] in {'|', '>'}:
  663. headerStart = lookbehind
  664. dec(lookbehind)
  665. assert headerStart != -1
  666. var indentation = 1
  667. while g.buf[lookbehind + indentation] == ' ': inc(indentation)
  668. if g.buf[lookbehind + indentation] in {'|', '>'}:
  669. # when the header is alone in a line, this line does not show the parent's
  670. # indentation, so we must go further. search the first previous line with
  671. # non-whitespace content.
  672. while lookbehind >= 0 and g.buf[lookbehind] in {'\x0A', '\x0D'}:
  673. dec(lookbehind)
  674. while lookbehind >= 0 and
  675. g.buf[lookbehind] in {' ', '\t'}: dec(lookbehind)
  676. # now, find the beginning of the line...
  677. while lookbehind >= 0 and g.buf[lookbehind] notin {'\x0A', '\x0D'}:
  678. dec(lookbehind)
  679. # ... and its indentation
  680. indentation = 1
  681. while g.buf[lookbehind + indentation] == ' ': inc(indentation)
  682. if lookbehind == -1: indentation = 0 # top level
  683. elif g.buf[lookbehind + 1] == '-' and g.buf[lookbehind + 2] == '-' and
  684. g.buf[lookbehind + 3] == '-' and
  685. g.buf[lookbehind + 4] in {'\x09'..'\x0D', ' '}:
  686. # this is a document start, therefore, we are at top level
  687. indentation = 0
  688. # because lookbehind was at newline char when calculating indentation, we're
  689. # off by one. fix that. top level's parent will have indentation of -1.
  690. let parentIndentation = indentation - 1
  691. # find first content
  692. while g.buf[pos] in {' ', '\x0A', '\x0D'}:
  693. if g.buf[pos] == ' ': inc(indentation)
  694. else: indentation = 0
  695. inc(pos)
  696. var minIndentation = indentation
  697. # for stupid edge cases, we must check whether an explicit indentation depth
  698. # is given at the header.
  699. while g.buf[headerStart] in {'>', '|', '+', '-'}: inc(headerStart)
  700. if g.buf[headerStart] in {'0'..'9'}:
  701. minIndentation = min(minIndentation, ord(g.buf[headerStart]) - ord('0'))
  702. # process content lines
  703. while indentation > parentIndentation and g.buf[pos] != '\0':
  704. if (indentation < minIndentation and g.buf[pos] == '#') or
  705. (indentation == 0 and g.buf[pos] == '.' and g.buf[pos + 1] == '.' and
  706. g.buf[pos + 2] == '.' and
  707. g.buf[pos + 3] in {'\0', '\x09'..'\x0D', ' '}):
  708. # comment after end of block scalar, or end of document
  709. break
  710. minIndentation = min(indentation, minIndentation)
  711. while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos)
  712. while g.buf[pos] in {' ', '\x0A', '\x0D'}:
  713. if g.buf[pos] == ' ': inc(indentation)
  714. else: indentation = 0
  715. inc(pos)
  716. g.state = gtOther
  717. elif g.state == gtOther:
  718. # gtOther means 'inside YAML document'
  719. case g.buf[pos]
  720. of ' ', '\x09'..'\x0D':
  721. g.kind = gtWhitespace
  722. while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos)
  723. of '#':
  724. g.kind = gtComment
  725. inc(pos)
  726. while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos)
  727. of '-':
  728. inc(pos)
  729. if g.buf[pos] in {'\0', ' ', '\x09'..'\x0D'}:
  730. g.kind = gtPunctuation
  731. elif g.buf[pos] == '-' and
  732. (pos == 1 or g.buf[pos - 2] in {'\x0A', '\x0D'}): # start of line
  733. inc(pos)
  734. if g.buf[pos] == '-' and g.buf[pos + 1] in {'\0', '\x09'..'\x0D', ' '}:
  735. inc(pos)
  736. g.kind = gtKeyword
  737. else: yamlPossibleNumber(g, pos)
  738. else: yamlPossibleNumber(g, pos)
  739. of '.':
  740. if pos == 0 or g.buf[pos - 1] in {'\x0A', '\x0D'}:
  741. inc(pos)
  742. for i in 1..2:
  743. {.unroll.}
  744. if g.buf[pos] != '.': break
  745. inc(pos)
  746. if pos == g.start + 3:
  747. g.kind = gtKeyword
  748. g.state = gtNone
  749. else: yamlPlainStrLit(g, pos)
  750. else: yamlPlainStrLit(g, pos)
  751. of '?':
  752. inc(pos)
  753. if g.buf[pos] in {'\0', ' ', '\x09'..'\x0D'}:
  754. g.kind = gtPunctuation
  755. else: yamlPlainStrLit(g, pos)
  756. of ':':
  757. inc(pos)
  758. if g.buf[pos] in {'\0', '\x09'..'\x0D', ' ', '\'', '\"'} or
  759. (pos > 0 and g.buf[pos - 2] in {'}', ']', '\"', '\''}):
  760. g.kind = gtPunctuation
  761. else: yamlPlainStrLit(g, pos)
  762. of '[', ']', '{', '}', ',':
  763. inc(pos)
  764. g.kind = gtPunctuation
  765. of '\"':
  766. inc(pos)
  767. g.state = gtStringLit
  768. g.kind = gtStringLit
  769. of '\'':
  770. g.state = gtCharLit
  771. g.kind = gtNone
  772. of '!':
  773. g.kind = gtTagStart
  774. inc(pos)
  775. if g.buf[pos] == '<':
  776. # literal tag (e.g. `!<tag:yaml.org,2002:str>`)
  777. while g.buf[pos] notin {'\0', '>', '\x09'..'\x0D', ' '}: inc(pos)
  778. if g.buf[pos] == '>': inc(pos)
  779. else:
  780. while g.buf[pos] in {'A'..'Z', 'a'..'z', '0'..'9', '-'}: inc(pos)
  781. case g.buf[pos]
  782. of '!':
  783. # prefixed tag (e.g. `!!str`)
  784. inc(pos)
  785. while g.buf[pos] notin
  786. {'\0', '\x09'..'\x0D', ' ', ',', '[', ']', '{', '}'}: inc(pos)
  787. of '\0', '\x09'..'\x0D', ' ': discard
  788. else:
  789. # local tag (e.g. `!nim:system:int`)
  790. while g.buf[pos] notin {'\0', '\x09'..'\x0D', ' '}: inc(pos)
  791. of '&':
  792. g.kind = gtLabel
  793. while g.buf[pos] notin {'\0', '\x09'..'\x0D', ' '}: inc(pos)
  794. of '*':
  795. g.kind = gtReference
  796. while g.buf[pos] notin {'\0', '\x09'..'\x0D', ' '}: inc(pos)
  797. of '|', '>':
  798. # this can lead to incorrect tokenization when | or > appear inside flow
  799. # content. checking whether we're inside flow content is not
  800. # chomsky type-3, so we won't do that here.
  801. g.kind = gtCommand
  802. g.state = gtCommand
  803. inc(pos)
  804. while g.buf[pos] in {'0'..'9', '+', '-'}: inc(pos)
  805. of '0'..'9': yamlPossibleNumber(g, pos)
  806. of '\0': g.kind = gtEOF
  807. else: yamlPlainStrLit(g, pos)
  808. else:
  809. # outside document
  810. case g.buf[pos]
  811. of '%':
  812. if pos == 0 or g.buf[pos - 1] in {'\x0A', '\x0D'}:
  813. g.kind = gtDirective
  814. while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos)
  815. else:
  816. g.state = gtOther
  817. yamlPlainStrLit(g, pos)
  818. of ' ', '\x09'..'\x0D':
  819. g.kind = gtWhitespace
  820. while g.buf[pos] in {' ', '\x09'..'\x0D'}: inc(pos)
  821. of '#':
  822. g.kind = gtComment
  823. while g.buf[pos] notin {'\0', '\x0A', '\x0D'}: inc(pos)
  824. of '\0': g.kind = gtEOF
  825. else:
  826. g.kind = gtNone
  827. g.state = gtOther
  828. g.length = pos - g.pos
  829. g.pos = pos
  830. proc getNextToken*(g: var GeneralTokenizer, lang: SourceLanguage) =
  831. case lang
  832. of langNone: assert false
  833. of langNim: nimNextToken(g)
  834. of langCpp: cppNextToken(g)
  835. of langCsharp: csharpNextToken(g)
  836. of langC: cNextToken(g)
  837. of langJava: javaNextToken(g)
  838. of langYaml: yamlNextToken(g)
  839. when isMainModule:
  840. var keywords: seq[string]
  841. # Try to work running in both the subdir or at the root.
  842. for filename in ["doc/keywords.txt", "../../../doc/keywords.txt"]:
  843. try:
  844. let input = string(readFile(filename))
  845. keywords = input.splitWhitespace()
  846. break
  847. except:
  848. echo filename, " not found"
  849. doAssert(keywords.len > 0, "Couldn't read any keywords.txt file!")
  850. for i in 0..min(keywords.len, nimKeywords.len)-1:
  851. doAssert keywords[i] == nimKeywords[i], "Unexpected keyword"
  852. doAssert keywords.len == nimKeywords.len, "No matching lengths"