pegs.nim 68 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2012 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## Simple PEG (Parsing expression grammar) matching. Uses no memorization, but
  10. ## uses superoperators and symbol inlining to improve performance. Note:
  11. ## Matching performance is hopefully competitive with optimized regular
  12. ## expression engines.
  13. ##
  14. ## .. include:: ../../doc/pegdocs.txt
  15. ##
  16. include "system/inclrtl"
  const
    useUnicode = true ## compile-time switch; set to `false` to deactivate proper UTF-8 support
  19. import strutils, macros
  20. when useUnicode:
  21. import unicode
  22. export unicode.`==`
  const
    InlineThreshold = 5   ## rules costing fewer leaves than this are inlined;
                          ## -1 disables inlining
    MaxSubpatterns* = 20  ## upper bound on the number of subpatterns that can be
                          ## captured; further subpatterns cannot be captured
  type
    PegKind* = enum
      ## Discriminator for the node kinds of a PEG expression tree.
      pkEmpty,
      pkAny,                 ## any character (.)
      pkAnyRune,             ## any Unicode character (_)
      pkNewLine,             ## CR-LF, LF, CR
      pkLetter,              ## Unicode letter
      pkLower,               ## Unicode lower case letter
      pkUpper,               ## Unicode upper case letter
      pkTitle,               ## Unicode title character
      pkWhitespace,          ## Unicode whitespace character
      pkTerminal,
      pkTerminalIgnoreCase,
      pkTerminalIgnoreStyle,
      pkChar,                ## single character to match
      pkCharChoice,
      pkNonTerminal,
      pkSequence,            ## a b c ... --> Internal DSL: peg(a, b, c)
      pkOrderedChoice,       ## a / b / ... --> Internal DSL: a / b or /[a, b, c]
      pkGreedyRep,           ## a*     --> Internal DSL: *a
                             ## a+     --> (a a*)
      pkGreedyRepChar,       ## x* where x is a single character (superop)
      pkGreedyRepSet,        ## [set]* (superop)
      pkGreedyAny,           ## .* or _* (superop)
      pkOption,              ## a?     --> Internal DSL: ?a
      pkAndPredicate,        ## &a     --> Internal DSL: &a
      pkNotPredicate,        ## !a     --> Internal DSL: !a
      pkCapture,             ## {a}    --> Internal DSL: capture(a)
      pkBackRef,             ## $i     --> Internal DSL: backref(i)
      pkBackRefIgnoreCase,
      pkBackRefIgnoreStyle,
      pkSearch,              ## @a     --> Internal DSL: !*a
      pkCapturedSearch,      ## {@} a  --> Internal DSL: !*\a
      pkRule,                ## a <- b
      pkList,                ## a, b
      pkStartAnchor          ## ^      --> Internal DSL: startAnchor()

    NonTerminalFlag* = enum
      ## State bits tracked per nonterminal symbol.
      ntDeclared, ntUsed

    NonTerminal* = ref NonTerminalObj

    NonTerminalObj = object
      ## Represents a non terminal symbol.
      name: string                 ## the name of the symbol
      line: int                    ## line the symbol has been declared/used in
      col: int                     ## column the symbol has been declared/used in
      flags: set[NonTerminalFlag]  ## the nonterminal's flags
      rule: Peg                    ## the rule that the symbol refers to

    Peg* {.shallow.} = object
      ## Type that represents a PEG. The payload depends on `kind`:
      ## the simple matchers carry nothing, terminals carry a string,
      ## character nodes a char or a char set, nonterminals a symbol
      ## reference, back references an index, and every remaining kind
      ## a list of child nodes.
      case kind: PegKind
      of pkEmpty..pkWhitespace:
        nil
      of pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle:
        term: string
      of pkChar, pkGreedyRepChar:
        ch: char
      of pkCharChoice, pkGreedyRepSet:
        charChoice: ref set[char]
      of pkNonTerminal:
        nt: NonTerminal
      of pkBackRef..pkBackRefIgnoreStyle:
        index: range[0..MaxSubpatterns]
      else:
        sons: seq[Peg]
  proc kind*(p: Peg): PegKind =
    ## Returns the *PegKind* discriminator of the given *Peg* object.
    result = p.kind

  proc term*(p: Peg): string =
    ## Returns the `term` string of a *Peg* variant object that has one
    ## (the terminal kinds).
    result = p.term

  proc ch*(p: Peg): char =
    ## Returns the `ch` character of a *Peg* variant object that has one
    ## (`pkChar`, `pkGreedyRepChar`).
    result = p.ch

  proc charChoice*(p: Peg): ref set[char] =
    ## Returns the `charChoice` reference of a *Peg* variant object that has
    ## one (`pkCharChoice`, `pkGreedyRepSet`).
    result = p.charChoice

  proc nt*(p: Peg): NonTerminal =
    ## Returns the *NonTerminal* of a `pkNonTerminal` *Peg* variant object.
    result = p.nt

  proc index*(p: Peg): range[0..MaxSubpatterns] =
    ## Returns the stored back-reference index of a back-reference *Peg*
    ## variant object.
    result = p.index
  iterator items*(p: Peg): Peg {.inline.} =
    ## Yields the child nodes (`sons`) of a *Peg* variant object, in order.
    for idx in 0 ..< p.sons.len:
      yield p.sons[idx]
  iterator pairs*(p: Peg): (int, Peg) {.inline.} =
    ## Yields `(index, child)` for every child node (`sons`) of a *Peg*
    ## variant object, in order.
    var idx = 0
    for child in p.sons:
      yield (idx, child)
      inc idx
  proc name*(nt: NonTerminal): string =
    ## Gets the name of the symbol represented by `nt`.
    result = nt.name

  proc line*(nt: NonTerminal): int =
    ## Gets the line number stored in `nt` (where the symbol was
    ## declared/used).
    result = nt.line

  proc col*(nt: NonTerminal): int =
    ## Gets the column number stored in `nt` (where the symbol was
    ## declared/used).
    result = nt.col

  proc flags*(nt: NonTerminal): set[NonTerminalFlag] =
    ## Gets the *NonTerminalFlag* set stored in `nt`.
    result = nt.flags

  proc rule*(nt: NonTerminal): Peg =
    ## Gets the *Peg* that `nt` refers to as its rule.
    result = nt.rule
  proc term*(t: string): Peg {.noSideEffect, rtl, extern: "npegs$1Str".} =
    ## Constructs a PEG from a terminal string. A string of exactly one
    ## character becomes a `pkChar` node; any other length (including the
    ## empty string) becomes a `pkTerminal` node.
    if t.len == 1:
      result = Peg(kind: pkChar, ch: t[0])
    else:
      result = Peg(kind: pkTerminal, term: t)
  proc termIgnoreCase*(t: string): Peg {.
    noSideEffect, rtl, extern: "npegs$1".} =
    ## Constructs a PEG from a terminal string that is matched ignoring case.
    Peg(kind: pkTerminalIgnoreCase, term: t)

  proc termIgnoreStyle*(t: string): Peg {.
    noSideEffect, rtl, extern: "npegs$1".} =
    ## Constructs a PEG from a terminal string that is matched ignoring style.
    Peg(kind: pkTerminalIgnoreStyle, term: t)
  proc term*(t: char): Peg {.noSideEffect, rtl, extern: "npegs$1Char".} =
    ## Constructs a PEG that matches the single character `t`.
    ## `t` must not be `'\0'` (checked with `assert`).
    assert t != '\0'
    Peg(kind: pkChar, ch: t)
  proc charSet*(s: set[char]): Peg {.noSideEffect, rtl, extern: "npegs$1".} =
    ## Constructs a PEG from a character set `s`. The set is copied into a
    ## freshly allocated `ref set[char]`. `s` must not contain `'\0'`
    ## (checked with `assert`).
    assert '\0' notin s
    var choice: ref set[char]
    new(choice)
    choice[] = s
    result = Peg(kind: pkCharChoice, charChoice: choice)
  proc len(a: Peg): int {.inline.} =
    ## Number of child nodes stored in `a.sons`.
    result = a.sons.len

  proc add(d: var Peg, s: Peg) {.inline.} =
    ## Appends `s` to the child list of `d`.
    d.sons.add(s)
  proc addChoice(dest: var Peg, elem: Peg) =
    ## Appends the alternative `elem` to `dest`. When the last alternative
    ## already in `dest` is a char set and `elem` is a char set or a single
    ## char, the two are merged into one new char set instead of appending.
    let last = dest.sons.len - 1
    if last < 0 or dest.sons[last].kind != pkCharChoice:
      dest.sons.add(elem)
      return
    # caution! Do not introduce false aliasing here: always build a fresh
    # set through `charSet` rather than mutating the shared ref.
    case elem.kind
    of pkChar:
      dest.sons[last] = charSet(dest.sons[last].charChoice[] + {elem.ch})
    of pkCharChoice:
      dest.sons[last] = charSet(dest.sons[last].charChoice[] + elem.charChoice[])
    else:
      dest.sons.add(elem)
  template multipleOp(k: PegKind, localOpt: untyped) =
    ## Shared body for n-ary constructors. Expects the instantiating proc to
    ## have a parameter `a` (the operands) and a `result` of type `Peg`.
    ## Builds a node of kind `k`, flattening operands that are themselves of
    ## kind `k`, and adds every operand through `localOpt(result, operand)`.
    ## If exactly one child remains, that child replaces the node.
    result = Peg(kind: k, sons: @[])
    for operand in items(a):
      if operand.kind != k:
        localOpt(result, operand)
      else:
        for nested in items(operand.sons):
          localOpt(result, nested)
    if result.len == 1:
      result = result.sons[0]
  proc `/`*(a: varargs[Peg]): Peg {.
    noSideEffect, rtl, extern: "npegsOrderedChoice".} =
    ## Constructs an ordered choice with the PEGs in `a`. Each alternative is
    ## added through `addChoice`, and the shared `multipleOp` body does the
    ## node construction.
    multipleOp(pkOrderedChoice, addChoice)
  proc addSequence(dest: var Peg, elem: Peg) =
    ## Appends `elem` to the sequence `dest`. When the last element already in
    ## `dest` is a `pkTerminal` and `elem` is a `pkTerminal` or `pkChar`, the
    ## two are concatenated into one new terminal instead of appending.
    let last = dest.sons.len - 1
    if last < 0 or dest.sons[last].kind != pkTerminal:
      dest.sons.add(elem)
      return
    # caution! Do not introduce false aliasing here: always build a fresh
    # node through `term`.
    case elem.kind
    of pkChar:
      dest.sons[last] = term(dest.sons[last].term & elem.ch)
    of pkTerminal:
      dest.sons[last] = term(dest.sons[last].term & elem.term)
    else:
      dest.sons.add(elem)
  proc sequence*(a: varargs[Peg]): Peg {.
    noSideEffect, rtl, extern: "npegs$1".} =
    ## Constructs a sequence with all the PEGs from `a`. Each element is
    ## added through `addSequence`, and the shared `multipleOp` body does the
    ## node construction.
    multipleOp(pkSequence, addSequence)
  proc `?`*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsOptional".} =
    ## Constructs an optional for the PEG `a`. Patterns that are already
    ## optional or greedy repetitions are returned unchanged:
    ##   a* ?  -->  a*
    ##   a? ?  -->  a?
    case a.kind
    of pkOption, pkGreedyRep, pkGreedyAny, pkGreedyRepChar, pkGreedyRepSet:
      result = a
    else:
      result = Peg(kind: pkOption, sons: @[a])
  proc `*`*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsGreedyRep".} =
    ## Constructs a "greedy repetition" for the PEG `a`. Single chars, char
    ## sets and the "any" matchers map to their dedicated superoperator
    ## kinds; everything else is wrapped in a `pkGreedyRep` node.
    ## Repeating an already optional/repeated pattern is rejected with
    ## `assert false`.
    const rejected = {pkGreedyRep, pkGreedyRepChar, pkGreedyRepSet,
                      pkGreedyAny, pkOption}
    if a.kind in rejected:
      # produces endless loop!
      assert false
    elif a.kind == pkChar:
      result = Peg(kind: pkGreedyRepChar, ch: a.ch)
    elif a.kind == pkCharChoice:
      result = Peg(kind: pkGreedyRepSet, charChoice: a.charChoice)
    elif a.kind in {pkAny, pkAnyRune}:
      result = Peg(kind: pkGreedyAny)
    else:
      result = Peg(kind: pkGreedyRep, sons: @[a])
  proc `!*`*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsSearch".} =
    ## Constructs a "search" for the PEG `a` (a `pkSearch` node with `a` as
    ## its only child).
    Peg(kind: pkSearch, sons: @[a])

  proc `!*\`*(a: Peg): Peg {.noSideEffect, rtl,
    extern: "npgegsCapturedSearch".} =
    ## Constructs a "captured search" for the PEG `a` (a `pkCapturedSearch`
    ## node with `a` as its only child).
    Peg(kind: pkCapturedSearch, sons: @[a])
  proc `+`*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsGreedyPosRep".} =
    ## Constructs a "greedy positive repetition" with the PEG `a`,
    ## expressed as the sequence `a a*`.
    result = sequence(a, *a)

  proc `&`*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsAndPredicate".} =
    ## Constructs an "and predicate" with the PEG `a`.
    Peg(kind: pkAndPredicate, sons: @[a])

  proc `!`*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsNotPredicate".} =
    ## Constructs a "not predicate" with the PEG `a`.
    Peg(kind: pkNotPredicate, sons: @[a])
  proc any*: Peg {.inline.} =
    ## constructs the PEG `any character`:idx: (``.``)
    Peg(kind: pkAny)

  proc anyRune*: Peg {.inline.} =
    ## constructs the PEG `any rune`:idx: (``_``)
    Peg(kind: pkAnyRune)

  proc newLine*: Peg {.inline.} =
    ## constructs the PEG `newline`:idx: (``\n``)
    Peg(kind: pkNewLine)

  proc unicodeLetter*: Peg {.inline.} =
    ## constructs the PEG ``\letter`` which matches any Unicode letter.
    Peg(kind: pkLetter)

  proc unicodeLower*: Peg {.inline.} =
    ## constructs the PEG ``\lower`` which matches any Unicode lowercase letter.
    Peg(kind: pkLower)

  proc unicodeUpper*: Peg {.inline.} =
    ## constructs the PEG ``\upper`` which matches any Unicode uppercase letter.
    Peg(kind: pkUpper)

  proc unicodeTitle*: Peg {.inline.} =
    ## constructs the PEG ``\title`` which matches any Unicode title letter.
    Peg(kind: pkTitle)

  proc unicodeWhitespace*: Peg {.inline.} =
    ## constructs the PEG ``\white`` which matches any Unicode
    ## whitespace character.
    Peg(kind: pkWhitespace)

  proc startAnchor*: Peg {.inline.} =
    ## constructs the PEG ``^`` which matches the start of the input.
    Peg(kind: pkStartAnchor)

  proc endAnchor*: Peg {.inline.} =
    ## constructs the PEG ``$`` which matches the end of the input;
    ## built as "not any character".
    !any()
  proc capture*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsCapture".} =
    ## Constructs a capture with the PEG `a` (a `pkCapture` node with `a` as
    ## its only child).
    Peg(kind: pkCapture, sons: @[a])
  proc backref*(index: range[1..MaxSubpatterns]): Peg {.
    noSideEffect, rtl, extern: "npegs$1".} =
    ## Constructs a back reference of the given `index`. `index` starts
    ## counting from 1; the node stores it zero-based (`index-1`).
    Peg(kind: pkBackRef, index: index-1)

  proc backrefIgnoreCase*(index: range[1..MaxSubpatterns]): Peg {.
    noSideEffect, rtl, extern: "npegs$1".} =
    ## Constructs a back reference of the given `index`. `index` starts
    ## counting from 1; the node stores it zero-based (`index-1`).
    ## Ignores case for matching.
    Peg(kind: pkBackRefIgnoreCase, index: index-1)

  proc backrefIgnoreStyle*(index: range[1..MaxSubpatterns]): Peg {.
    noSideEffect, rtl, extern: "npegs$1".} =
    ## Constructs a back reference of the given `index`. `index` starts
    ## counting from 1; the node stores it zero-based (`index-1`).
    ## Ignores style for matching.
    Peg(kind: pkBackRefIgnoreStyle, index: index-1)
  proc spaceCost(n: Peg): int =
    ## Estimates the size of `n` in leaves, used to decide whether a rule is
    ## small enough to be inlined (compared against `InlineThreshold`).
    ##
    ## * `pkEmpty` costs 0.
    ## * Every leaf matcher costs 1. This includes back references: their
    ##   variant branch holds `index`, not `sons`, so they must not fall
    ##   through to the child-walking branch below.
    ## * A nonterminal costs `InlineThreshold+1`, which prevents inlining.
    ## * Any other node costs the sum of its children; the walk stops early
    ##   once the sum reaches `InlineThreshold`.
    case n.kind
    of pkEmpty: discard
    of pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle, pkChar,
       pkGreedyRepChar, pkCharChoice, pkGreedyRepSet,
       pkAny..pkWhitespace, pkGreedyAny,
       pkBackRef..pkBackRefIgnoreStyle:
      result = 1
    of pkNonTerminal:
      # we cannot inline a rule with a non-terminal
      result = InlineThreshold+1
    else:
      # only kinds that actually own a `sons` list reach this branch
      for i in 0..n.len-1:
        inc(result, spaceCost(n.sons[i]))
        if result >= InlineThreshold: break
  proc nonterminal*(n: NonTerminal): Peg {.
    noSideEffect, rtl, extern: "npegs$1".} =
    ## Constructs a PEG that consists of the nonterminal symbol `n`.
    ## If `n` is already declared and its rule is cheap enough
    ## (`spaceCost` below `InlineThreshold`), the rule itself is returned
    ## instead, since inlining enables better optimizations.
    assert n != nil
    let inlinable = ntDeclared in n.flags and
                    spaceCost(n.rule) < InlineThreshold
    if inlinable:
      result = n.rule
    else:
      result = Peg(kind: pkNonTerminal, nt: n)
  proc newNonTerminal*(name: string, line, column: int): NonTerminal {.
    noSideEffect, rtl, extern: "npegs$1".} =
    ## Constructs a nonterminal symbol with the given `name` and source
    ## position; flags and rule are left at their defaults.
    new(result)
    result.name = name
    result.line = line
    result.col = column
  template letters*: Peg =
    ## expands to ``charSet({'A'..'Z', 'a'..'z'})``
    charSet({'a'..'z', 'A'..'Z'})

  template digits*: Peg =
    ## expands to ``charSet({'0'..'9'})``
    charSet({'0'..'9'})

  template whitespace*: Peg =
    ## expands to ``charSet({' ', '\9'..'\13'})``
    charSet({'\9'..'\13', ' '})

  template identChars*: Peg =
    ## expands to ``charSet({'a'..'z', 'A'..'Z', '0'..'9', '_'})``
    charSet({'_', '0'..'9', 'A'..'Z', 'a'..'z'})

  template identStartChars*: Peg =
    ## expands to ``charSet({'A'..'Z', 'a'..'z', '_'})``
    charSet({'_', 'A'..'Z', 'a'..'z'})

  template ident*: Peg =
    ## same as ``[a-zA-Z_][a-zA-Z_0-9]*``; standard identifier
    sequence(charSet({'_', 'A'..'Z', 'a'..'z'}),
             *charSet({'_', '0'..'9', 'A'..'Z', 'a'..'z'}))

  template natural*: Peg =
    ## same as ``\d+``
    +digits
  324. # ------------------------- debugging -----------------------------------------
  proc esc(c: char, reserved = {'\0'..'\255'}): string =
    ## Returns the escaped textual form of `c` for use in a printed PEG.
    ## Known control characters and the backslash get mnemonic escapes,
    ## ASCII letters, digits and `_` are returned as-is, other characters
    ## below `' '` or from `'\127'` upwards are written as a backslash
    ## followed by their decimal ordinal, and any remaining character is
    ## backslash-prefixed only when it is in `reserved`.
    case c
    of 'a'..'z', 'A'..'Z', '0'..'9', '_': result = $c
    of '\\': result = "\\\\"
    of '\a': result = "\\a"
    of '\b': result = "\\b"
    of '\t': result = "\\t"
    of '\L': result = "\\l"
    of '\v': result = "\\v"
    of '\f': result = "\\f"
    of '\c': result = "\\c"
    of '\e': result = "\\e"
    else:
      if c < ' ' or c >= '\127':
        result = '\\' & $ord(c)
      elif c in reserved:
        result = '\\' & c
      else:
        result = $c
  proc singleQuoteEsc(c: char): string =
    ## Returns `c` escaped via `esc` (with only `'` reserved) and wrapped in
    ## single quotes.
    result = "'"
    result.add esc(c, {'\''})
    result.add "'"

  proc singleQuoteEsc(str: string): string =
    ## Returns `str` with every character escaped via `esc` (with only `'`
    ## reserved) and the whole wrapped in single quotes.
    result = "'"
    for ch in str:
      result.add esc(ch, {'\''})
    result.add '\''
  proc charSetEscAux(cc: set[char]): string =
    ## Renders the members of `cc` as the inside of a PEG character class
    ## (without the surrounding brackets). Consecutive members are grouped
    ## into runs: a single char is written alone, a run of two as both chars,
    ## and a longer run as ``first-last``. `^`, `-` and `]` are escaped.
    const reserved = {'^', '-', ']'}
    result = ""
    var lo = 0
    while lo <= 0xff:
      if chr(lo) notin cc:
        inc lo
        continue
      # extend the run [lo..hi] as far as consecutive members go
      var hi = lo
      while hi < 0xff and chr(hi + 1) in cc:
        inc hi
      result.add esc(chr(lo), reserved)
      case hi - lo
      of 0:
        discard
      of 1:
        result.add esc(chr(hi), reserved)
      else:
        result.add '-'
        result.add esc(chr(hi), reserved)
      lo = hi + 1
  proc charSetEsc(cc: set[char]): string =
    ## Renders `cc` as a bracketed PEG character class. Sets with at least
    ## 192 members are written in negated form (``[^...]``), listing the
    ## members of ``{'\1'..'\xFF'}`` that are missing from `cc`.
    let negated = card(cc) >= 128+64
    if negated:
      result = "[^" & charSetEscAux({'\1'..'\xFF'} - cc) & ']'
    else:
      result = '[' & charSetEscAux(cc) & ']'
  proc toStrAux(r: Peg, res: var string) =
    ## Appends the textual PEG representation of `r` to `res`, recursing
    ## into child nodes.
    ##
    ## Notes:
    ## * Back references are printed 1-based (``$1`` for the first capture),
    ##   matching the ``$i`` syntax and the `backref` constructor; the node
    ##   itself stores the index zero-based.
    ## * A sequence or ordered choice without children is printed as ``()``
    ##   instead of indexing a non-existent first child.
    case r.kind
    of pkEmpty: add(res, "()")
    of pkAny: add(res, '.')
    of pkAnyRune: add(res, '_')
    of pkLetter: add(res, "\\letter")
    of pkLower: add(res, "\\lower")
    of pkUpper: add(res, "\\upper")
    of pkTitle: add(res, "\\title")
    of pkWhitespace: add(res, "\\white")
    of pkNewLine: add(res, "\\n")
    of pkTerminal: add(res, singleQuoteEsc(r.term))
    of pkTerminalIgnoreCase:
      add(res, 'i')
      add(res, singleQuoteEsc(r.term))
    of pkTerminalIgnoreStyle:
      add(res, 'y')
      add(res, singleQuoteEsc(r.term))
    of pkChar: add(res, singleQuoteEsc(r.ch))
    of pkCharChoice: add(res, charSetEsc(r.charChoice[]))
    of pkNonTerminal: add(res, r.nt.name)
    of pkSequence, pkOrderedChoice:
      # Both kinds print as a parenthesised list; only the separator differs.
      let sep = if r.kind == pkSequence: " " else: " / "
      add(res, '(')
      for i in 0 .. high(r.sons):
        if i > 0: add(res, sep)
        toStrAux(r.sons[i], res)
      add(res, ')')
    of pkGreedyRep:
      toStrAux(r.sons[0], res)
      add(res, '*')
    of pkGreedyRepChar:
      add(res, singleQuoteEsc(r.ch))
      add(res, '*')
    of pkGreedyRepSet:
      add(res, charSetEsc(r.charChoice[]))
      add(res, '*')
    of pkGreedyAny:
      add(res, ".*")
    of pkOption:
      toStrAux(r.sons[0], res)
      add(res, '?')
    of pkAndPredicate:
      add(res, '&')
      toStrAux(r.sons[0], res)
    of pkNotPredicate:
      add(res, '!')
      toStrAux(r.sons[0], res)
    of pkSearch:
      add(res, '@')
      toStrAux(r.sons[0], res)
    of pkCapturedSearch:
      add(res, "{@}")
      toStrAux(r.sons[0], res)
    of pkCapture:
      add(res, '{')
      toStrAux(r.sons[0], res)
      add(res, '}')
    of pkBackRef:
      add(res, '$')
      add(res, $(r.index + 1)) # stored zero-based, printed one-based
    of pkBackRefIgnoreCase:
      add(res, "i$")
      add(res, $(r.index + 1))
    of pkBackRefIgnoreStyle:
      add(res, "y$")
      add(res, $(r.index + 1))
    of pkRule:
      toStrAux(r.sons[0], res)
      add(res, " <- ")
      toStrAux(r.sons[1], res)
    of pkList:
      for i in 0 .. high(r.sons):
        toStrAux(r.sons[i], res)
        add(res, "\n")
    of pkStartAnchor:
      add(res, '^')
  proc `$` *(r: Peg): string {.noSideEffect, rtl, extern: "npegsToString".} =
    ## Converts a PEG to its string representation.
    var text = ""
    toStrAux(r, text)
    result = text
  454. # --------------------- core engine -------------------------------------------
  type
    Captures* = object
      ## Contains the captured substrings, stored as index bounds.
      matches: array[0..MaxSubpatterns-1, tuple[first, last: int]]
        ## `(first, last)` bounds of each capture slot
      ml: int
        ## presumably the number of capture slots currently in use
        ## (back references with an index >= `ml` fail to match) -- TODO confirm
      origStart: int
        ## NOTE(review): not used in the visible part of the file; by its name
        ## the original start offset of the match -- confirm against callers
  proc bounds*(c: Captures,
               i: range[0..MaxSubpatterns-1]): tuple[first, last: int] =
    ## Returns the bounds ``[first..last]`` of the `i`'th capture, exactly as
    ## stored in `c`.
    c.matches[i]
  when not useUnicode:
    # Byte-based stand-ins for the `unicode` API, used when UTF-8 support is
    # deactivated: a "rune" is then simply one `char`.
    type
      Rune = char

    template fastRuneAt(s, i, ch) =
      ## Reads the char at `s[i]` into `ch` and advances `i` by one.
      ch = s[i]
      inc(i)

    template runeLenAt(s, i): untyped = 1
      ## Every "rune" is exactly one byte long in this mode.

    proc isAlpha(a: char): bool {.inline.} =
      ## ASCII letter test.
      a in {'a'..'z', 'A'..'Z'}

    proc isUpper(a: char): bool {.inline.} =
      ## ASCII upper case letter test.
      a in {'A'..'Z'}

    proc isLower(a: char): bool {.inline.} =
      ## ASCII lower case letter test.
      a in {'a'..'z'}

    proc isTitle(a: char): bool {.inline.} =
      ## No char counts as a title character in this mode.
      false

    proc isWhiteSpace(a: char): bool {.inline.} =
      ## Space or one of the chars `'\9'..'\13'`.
      a in {' ', '\9'..'\13'}
  476. template matchOrParse(mopProc: untyped) =
  477. # Used to make the main matcher proc *rawMatch* as well as event parser
  478. # procs. For the former, *enter* and *leave* event handler code generators
  479. # are provided which just return *discard*.
  480. proc mopProc(s: string, p: Peg, start: int, c: var Captures): int =
  481. proc matchBackRef(s: string, p: Peg, start: int, c: var Captures): int =
  482. # Parse handler code must run in an *of* clause of its own for each
  483. # *PegKind*, so we encapsulate the identical clause body for
  484. # *pkBackRef..pkBackRefIgnoreStyle* here.
  485. if p.index >= c.ml: return -1
  486. var (a, b) = c.matches[p.index]
  487. var n: Peg
  488. case p.kind
  489. of pkBackRef:
  490. n = Peg(kind: pkTerminal, term: s.substr(a, b))
  491. of pkBackRefIgnoreStyle:
  492. n = Peg(kind: pkTerminalIgnoreStyle, term: s.substr(a, b))
  493. of pkBackRefIgnoreCase:
  494. n = Peg(kind: pkTerminalIgnoreCase, term: s.substr(a, b))
  495. else: assert(false, "impossible case")
  496. mopProc(s, n, start, c)
  497. case p.kind
  498. of pkEmpty:
  499. enter(pkEmpty, s, p, start)
  500. result = 0 # match of length 0
  501. leave(pkEmpty, s, p, start, result)
  502. of pkAny:
  503. enter(pkAny, s, p, start)
  504. if start < s.len: result = 1
  505. else: result = -1
  506. leave(pkAny, s, p, start, result)
  507. of pkAnyRune:
  508. enter(pkAnyRune, s, p, start)
  509. if start < s.len:
  510. result = runeLenAt(s, start)
  511. else:
  512. result = -1
  513. leave(pkAnyRune, s, p, start, result)
  514. of pkLetter:
  515. enter(pkLetter, s, p, start)
  516. if start < s.len:
  517. var a: Rune
  518. result = start
  519. fastRuneAt(s, result, a)
  520. if isAlpha(a): dec(result, start)
  521. else: result = -1
  522. else:
  523. result = -1
  524. leave(pkLetter, s, p, start, result)
  525. of pkLower:
  526. enter(pkLower, s, p, start)
  527. if start < s.len:
  528. var a: Rune
  529. result = start
  530. fastRuneAt(s, result, a)
  531. if isLower(a): dec(result, start)
  532. else: result = -1
  533. else:
  534. result = -1
  535. leave(pkLower, s, p, start, result)
  536. of pkUpper:
  537. enter(pkUpper, s, p, start)
  538. if start < s.len:
  539. var a: Rune
  540. result = start
  541. fastRuneAt(s, result, a)
  542. if isUpper(a): dec(result, start)
  543. else: result = -1
  544. else:
  545. result = -1
  546. leave(pkUpper, s, p, start, result)
  547. of pkTitle:
  548. enter(pkTitle, s, p, start)
  549. if start < s.len:
  550. var a: Rune
  551. result = start
  552. fastRuneAt(s, result, a)
  553. if isTitle(a): dec(result, start)
  554. else: result = -1
  555. else:
  556. result = -1
  557. leave(pkTitle, s, p, start, result)
  558. of pkWhitespace:
  559. enter(pkWhitespace, s, p, start)
  560. if start < s.len:
  561. var a: Rune
  562. result = start
  563. fastRuneAt(s, result, a)
  564. if isWhiteSpace(a): dec(result, start)
  565. else: result = -1
  566. else:
  567. result = -1
  568. leave(pkWhitespace, s, p, start, result)
  569. of pkGreedyAny:
  570. enter(pkGreedyAny, s, p, start)
  571. result = len(s) - start
  572. leave(pkGreedyAny, s, p, start, result)
  573. of pkNewLine:
  574. enter(pkNewLine, s, p, start)
  575. if start < s.len and s[start] == '\L': result = 1
  576. elif start < s.len and s[start] == '\C':
  577. if start+1 < s.len and s[start+1] == '\L': result = 2
  578. else: result = 1
  579. else: result = -1
  580. leave(pkNewLine, s, p, start, result)
  581. of pkTerminal:
  582. enter(pkTerminal, s, p, start)
  583. result = len(p.term)
  584. for i in 0..result-1:
  585. if start+i >= s.len or p.term[i] != s[start+i]:
  586. result = -1
  587. break
  588. leave(pkTerminal, s, p, start, result)
  589. of pkTerminalIgnoreCase:
  590. enter(pkTerminalIgnoreCase, s, p, start)
  591. var
  592. i = 0
  593. a, b: Rune
  594. result = start
  595. while i < len(p.term):
  596. if result >= s.len:
  597. result = -1
  598. break
  599. fastRuneAt(p.term, i, a)
  600. fastRuneAt(s, result, b)
  601. if toLower(a) != toLower(b):
  602. result = -1
  603. break
  604. dec(result, start)
  605. leave(pkTerminalIgnoreCase, s, p, start, result)
  606. of pkTerminalIgnoreStyle:
  607. enter(pkTerminalIgnoreStyle, s, p, start)
  608. var
  609. i = 0
  610. a, b: Rune
  611. result = start
  612. while i < len(p.term):
  613. while i < len(p.term):
  614. fastRuneAt(p.term, i, a)
  615. if a != Rune('_'): break
  616. while result < s.len:
  617. fastRuneAt(s, result, b)
  618. if b != Rune('_'): break
  619. if result >= s.len:
  620. if i >= p.term.len: break
  621. else:
  622. result = -1
  623. break
  624. elif toLower(a) != toLower(b):
  625. result = -1
  626. break
  627. dec(result, start)
  628. leave(pkTerminalIgnoreStyle, s, p, start, result)
  629. of pkChar:
  630. enter(pkChar, s, p, start)
  631. if start < s.len and p.ch == s[start]: result = 1
  632. else: result = -1
  633. leave(pkChar, s, p, start, result)
  634. of pkCharChoice:
  635. enter(pkCharChoice, s, p, start)
  636. if start < s.len and contains(p.charChoice[], s[start]): result = 1
  637. else: result = -1
  638. leave(pkCharChoice, s, p, start, result)
  639. of pkNonTerminal:
  640. enter(pkNonTerminal, s, p, start)
  641. var oldMl = c.ml
  642. when false: echo "enter: ", p.nt.name
  643. result = mopProc(s, p.nt.rule, start, c)
  644. when false: echo "leave: ", p.nt.name
  645. if result < 0: c.ml = oldMl
  646. leave(pkNonTerminal, s, p, start, result)
  647. of pkSequence:
  648. enter(pkSequence, s, p, start)
  649. var oldMl = c.ml
  650. result = 0
  651. for i in 0..high(p.sons):
  652. var x = mopProc(s, p.sons[i], start+result, c)
  653. if x < 0:
  654. c.ml = oldMl
  655. result = -1
  656. break
  657. else: inc(result, x)
  658. leave(pkSequence, s, p, start, result)
  659. of pkOrderedChoice:
  660. enter(pkOrderedChoice, s, p, start)
  661. var oldMl = c.ml
  662. for i in 0..high(p.sons):
  663. result = mopProc(s, p.sons[i], start, c)
  664. if result >= 0: break
  665. c.ml = oldMl
  666. leave(pkOrderedChoice, s, p, start, result)
  667. of pkSearch:
  668. enter(pkSearch, s, p, start)
  669. var oldMl = c.ml
  670. result = 0
  671. while start+result <= s.len:
  672. var x = mopProc(s, p.sons[0], start+result, c)
  673. if x >= 0:
  674. inc(result, x)
  675. leave(pkSearch, s, p, start, result)
  676. return
  677. inc(result)
  678. result = -1
  679. c.ml = oldMl
  680. leave(pkSearch, s, p, start, result)
  681. of pkCapturedSearch:
  682. enter(pkCapturedSearch, s, p, start)
  683. var idx = c.ml # reserve a slot for the subpattern
  684. inc(c.ml)
  685. result = 0
  686. while start+result <= s.len:
  687. var x = mopProc(s, p.sons[0], start+result, c)
  688. if x >= 0:
  689. if idx < MaxSubpatterns:
  690. c.matches[idx] = (start, start+result-1)
  691. #else: silently ignore the capture
  692. inc(result, x)
  693. leave(pkCapturedSearch, s, p, start, result)
  694. return
  695. inc(result)
  696. result = -1
  697. c.ml = idx
  698. leave(pkCapturedSearch, s, p, start, result)
  699. of pkGreedyRep:
  700. enter(pkGreedyRep, s, p, start)
  701. result = 0
  702. while true:
  703. var x = mopProc(s, p.sons[0], start+result, c)
  704. # if x == 0, we have an endless loop; so the correct behaviour would be
  705. # not to break. But endless loops can be easily introduced:
  706. # ``(comment / \w*)*`` is such an example. Breaking for x == 0 does the
  707. # expected thing in this case.
  708. if x <= 0: break
  709. inc(result, x)
  710. leave(pkGreedyRep, s, p, start, result)
  711. of pkGreedyRepChar:
  712. enter(pkGreedyRepChar, s, p, start)
  713. result = 0
  714. var ch = p.ch
  715. while start+result < s.len and ch == s[start+result]: inc(result)
  716. leave(pkGreedyRepChar, s, p, start, result)
  717. of pkGreedyRepSet:
  718. enter(pkGreedyRepSet, s, p, start)
  719. result = 0
  720. while start+result < s.len and contains(p.charChoice[], s[start+result]):
  721. inc(result)
  722. leave(pkGreedyRepSet, s, p, start, result)
  723. of pkOption:
  724. enter(pkOption, s, p, start)
  725. result = max(0, mopProc(s, p.sons[0], start, c))
  726. leave(pkOption, s, p, start, result)
  727. of pkAndPredicate:
  728. enter(pkAndPredicate, s, p, start)
  729. var oldMl = c.ml
  730. result = mopProc(s, p.sons[0], start, c)
  731. if result >= 0: result = 0 # do not consume anything
  732. else: c.ml = oldMl
  733. leave(pkAndPredicate, s, p, start, result)
  734. of pkNotPredicate:
  735. enter(pkNotPredicate, s, p, start)
  736. var oldMl = c.ml
  737. result = mopProc(s, p.sons[0], start, c)
  738. if result < 0: result = 0
  739. else:
  740. c.ml = oldMl
  741. result = -1
  742. leave(pkNotPredicate, s, p, start, result)
  743. of pkCapture:
  744. enter(pkCapture, s, p, start)
  745. var idx = c.ml # reserve a slot for the subpattern
  746. inc(c.ml)
  747. result = mopProc(s, p.sons[0], start, c)
  748. if result >= 0:
  749. if idx < MaxSubpatterns:
  750. c.matches[idx] = (start, start+result-1)
  751. #else: silently ignore the capture
  752. else:
  753. c.ml = idx
  754. leave(pkCapture, s, p, start, result)
  755. of pkBackRef:
  756. enter(pkBackRef, s, p, start)
  757. result = matchBackRef(s, p, start, c)
  758. leave(pkBackRef, s, p, start, result)
  759. of pkBackRefIgnoreCase:
  760. enter(pkBackRefIgnoreCase, s, p, start)
  761. result = matchBackRef(s, p, start, c)
  762. leave(pkBackRefIgnoreCase, s, p, start, result)
  763. of pkBackRefIgnoreStyle:
  764. enter(pkBackRefIgnoreStyle, s, p, start)
  765. result = matchBackRef(s, p, start, c)
  766. leave(pkBackRefIgnoreStyle, s, p, start, result)
  767. of pkStartAnchor:
  768. enter(pkStartAnchor, s, p, start)
  769. if c.origStart == start: result = 0
  770. else: result = -1
  771. leave(pkStartAnchor, s, p, start, result)
  772. of pkRule, pkList: assert false
  773. proc rawMatch*(s: string, p: Peg, start: int, c: var Captures): int
  774. {.noSideEffect, rtl, extern: "npegs$1".} =
  775. ## low-level matching proc that implements the PEG interpreter. Use this
  776. ## for maximum efficiency (every other PEG operation ends up calling this
  777. ## proc).
  778. ## Returns -1 if it does not match, else the length of the match
  779. # Set the handler generators to produce do-nothing handlers.
  780. template enter(pk, s, p, start) =
  781. discard
  782. template leave(pk, s, p, start, length) =
  783. discard
  784. matchOrParse(matchIt)
  785. result = matchIt(s, p, start, c)
macro mkHandlerTplts(handlers: untyped): untyped =
  # Transforms the handler spec in *handlers* into handler templates.
  # For every `enter`/`leave` block found under a PegKind, one template is
  # emitted whose name is "enter"/"leave" followed by the PegKind name with
  # its first two characters removed (e.g. `pkChar` -> `leaveChar`).
  #
  # The AST structure of *handlers[0]*:
  #
  # .. code-block::
  # StmtList
  #   Call
  #     Ident "pkNonTerminal"
  #     StmtList
  #       Call
  #         Ident "enter"
  #         StmtList
  #           <handler code block>
  #       Call
  #         Ident "leave"
  #         StmtList
  #           <handler code block>
  #   Call
  #     Ident "pkChar"
  #     StmtList
  #       Call
  #         Ident "leave"
  #         StmtList
  #           <handler code block>
  #   ...
  proc mkEnter(hdName, body: NimNode): NimNode =
    # Builds the AST of a template named *hdName* taking (s, p, start).
    # The parameters are re-bound with `inject` so that the handler code in
    # *body* can refer to them by these names.
    template helper(hdName, body) {.dirty.} =
      template hdName(s, p, start) =
        let s {.inject.} = s
        let p {.inject.} = p
        let start {.inject.} = start
        body
    result = getAst(helper(hdName, body))

  template mkLeave(hdPostf, body) {.dirty.} =
    # this has to be dirty to be able to capture *result* as *length* in
    # *leaveXX* calls.
    template `leave hdPostf`(s, p, start, length) =
      body

  result = newStmtList()
  for topCall in handlers[0]:
    # Each top-level entry must look like `pkSomething: <handlers>`.
    if topCall.kind notin nnkCallKinds:
      error("Call syntax expected.", topCall)
    let pegKind = topCall[0]
    if pegKind.kind notin {nnkIdent, nnkSym}:
      error("PegKind expected.", pegKind)
    if 2 == topCall.len:
      for hdDef in topCall[1]:
        # Each nested entry must look like `enter: <code>` or `leave: <code>`.
        if hdDef.kind notin nnkCallKinds:
          error("Call syntax expected.", hdDef)
        if hdDef[0].kind notin {nnkIdent, nnkSym}:
          error("Handler identifier expected.", hdDef[0])
        if 2 == hdDef.len:
          # Drop the first two characters of the PegKind name to form the
          # template name postfix.
          let hdPostf = substr(pegKind.strVal, 2)
          case hdDef[0].strVal
          of "enter":
            result.add mkEnter(newIdentNode("enter" & hdPostf), hdDef[1])
          of "leave":
            result.add getAst(mkLeave(ident(hdPostf), hdDef[1]))
          else:
            error(
              "Unsupported handler identifier, expected 'enter' or 'leave'.",
              hdDef[0]
            )
template eventParser*(pegAst, handlers: untyped): (proc(s: string): int) =
  ## Generates an interpreting event parser *proc* according to the specified
  ## PEG AST and handler code blocks. The *proc* can be called with a string
  ## to be parsed and will execute the handler code blocks whenever their
  ## associated grammar element is matched. It returns -1 if the string does not
  ## match, else the length of the total match. The following example code
  ## evaluates an arithmetic expression defined by a simple PEG:
  ##
  ## .. code-block:: nim
  ##  import strutils, pegs
  ##
  ##  let
  ##    pegAst = """
  ##  Expr    <- Sum
  ##  Sum     <- Product (('+' / '-')Product)*
  ##  Product <- Value (('*' / '/')Value)*
  ##  Value   <- [0-9]+ / '(' Expr ')'
  ##    """.peg
  ##    txt = "(5+3)/2-7*22"
  ##
  ##  var
  ##    pStack: seq[string] = @[]
  ##    valStack: seq[float] = @[]
  ##    opStack = ""
  ##  let
  ##    parseArithExpr = pegAst.eventParser:
  ##      pkNonTerminal:
  ##        enter:
  ##          pStack.add p.nt.name
  ##        leave:
  ##          pStack.setLen pStack.high
  ##          if length > 0:
  ##            let matchStr = s.substr(start, start+length-1)
  ##            case p.nt.name
  ##            of "Value":
  ##              try:
  ##                valStack.add matchStr.parseFloat
  ##                echo valStack
  ##              except ValueError:
  ##                discard
  ##            of "Sum", "Product":
  ##              try:
  ##                let val = matchStr.parseFloat
  ##              except ValueError:
  ##                if valStack.len > 1 and opStack.len > 0:
  ##                  valStack[^2] = case opStack[^1]
  ##                  of '+': valStack[^2] + valStack[^1]
  ##                  of '-': valStack[^2] - valStack[^1]
  ##                  of '*': valStack[^2] * valStack[^1]
  ##                  else: valStack[^2] / valStack[^1]
  ##                  valStack.setLen valStack.high
  ##                  echo valStack
  ##                  opStack.setLen opStack.high
  ##                  echo opStack
  ##      pkChar:
  ##        leave:
  ##          if length == 1 and "Value" != pStack[^1]:
  ##            let matchChar = s[start]
  ##            opStack.add matchChar
  ##            echo opStack
  ##
  ##  let pLen = parseArithExpr(txt)
  ##
  ## The *handlers* parameter consists of code blocks for *PegKinds*,
  ## which define the grammar elements of interest. Each block can contain
  ## handler code to be executed when the parser enters and leaves text
  ## matching the grammar element. An *enter* handler can access the specific
  ## PEG AST node being matched as *p*, the entire parsed string as *s*
  ## and the position of the matched text segment in *s* as *start*. A *leave*
  ## handler can access *p*, *s*, *start* and also the length of the matched
  ## text segment as *length*. For an unsuccessful match, the *enter* and
  ## *leave* handlers will be executed, with *length* set to -1.
  ##
  ## Symbols declared in an *enter* handler can be made visible in the
  ## corresponding *leave* handler by annotating them with an *inject* pragma.
  proc rawParse(s: string, p: Peg, start: int, c: var Captures): int
      {.genSym.} =

    # binding from *macros*
    bind strVal

    # Emit one `enterXX`/`leaveXX` template per handler block supplied by
    # the caller.
    mkHandlerTplts:
      handlers

    macro enter(pegKind, s, pegNode, start: untyped): untyped =
      # This is called by the matcher code in *matchOrParse* at the
      # start of the code for a grammar element of kind *pegKind*.
      # Expands to a call to the handler template if one was generated
      # by *mkHandlerTplts*.
      template mkDoEnter(hdPostf, s, pegNode, start) =
        when declared(`enter hdPostf`):
          `enter hdPostf`(s, pegNode, start):
        else:
          discard
      # The handler name postfix is the PegKind name without its first two
      # characters.
      let hdPostf = ident(substr(strVal(pegKind), 2))
      getAst(mkDoEnter(hdPostf, s, pegNode, start))

    macro leave(pegKind, s, pegNode, start, length: untyped): untyped =
      # Like *enter*, but called at the end of the matcher code for
      # a grammar element of kind *pegKind*.
      template mkDoLeave(hdPostf, s, pegNode, start, length) =
        when declared(`leave hdPostf`):
          `leave hdPostf`(s, pegNode, start, length):
        else:
          discard
      let hdPostf = ident(substr(strVal(pegKind), 2))
      getAst(mkDoLeave(hdPostf, s, pegNode, start, length))

    # Instantiate the shared matcher body under the name `parseIt`; it picks
    # up the `enter`/`leave` macros defined above.
    matchOrParse(parseIt)
    parseIt(s, p, start, c)

  proc parser(s: string): int {.genSym.} =
    # the proc to be returned
    # Parsing always begins at index 0 with an empty capture state.
    var
      ms: array[MaxSubpatterns, (int, int)]
      cs = Captures(matches: ms, ml: 0, origStart: 0)
    rawParse(s, pegAst, 0, cs)
  parser
  961. template fillMatches(s, caps, c) =
  962. for k in 0..c.ml-1:
  963. let startIdx = c.matches[k][0]
  964. let endIdx = c.matches[k][1]
  965. if startIdx != -1:
  966. caps[k] = substr(s, startIdx, endIdx)
  967. else:
  968. caps[k] = ""
  969. proc matchLen*(s: string, pattern: Peg, matches: var openArray[string],
  970. start = 0): int {.noSideEffect, rtl, extern: "npegs$1Capture".} =
  971. ## the same as ``match``, but it returns the length of the match,
  972. ## if there is no match, -1 is returned. Note that a match length
  973. ## of zero can happen. It's possible that a suffix of `s` remains
  974. ## that does not belong to the match.
  975. var c: Captures
  976. c.origStart = start
  977. result = rawMatch(s, pattern, start, c)
  978. if result >= 0: fillMatches(s, matches, c)
  979. proc matchLen*(s: string, pattern: Peg,
  980. start = 0): int {.noSideEffect, rtl, extern: "npegs$1".} =
  981. ## the same as ``match``, but it returns the length of the match,
  982. ## if there is no match, -1 is returned. Note that a match length
  983. ## of zero can happen. It's possible that a suffix of `s` remains
  984. ## that does not belong to the match.
  985. var c: Captures
  986. c.origStart = start
  987. result = rawMatch(s, pattern, start, c)
  988. proc match*(s: string, pattern: Peg, matches: var openArray[string],
  989. start = 0): bool {.noSideEffect, rtl, extern: "npegs$1Capture".} =
  990. ## returns ``true`` if ``s[start..]`` matches the ``pattern`` and
  991. ## the captured substrings in the array ``matches``. If it does not
  992. ## match, nothing is written into ``matches`` and ``false`` is
  993. ## returned.
  994. result = matchLen(s, pattern, matches, start) != -1
  995. proc match*(s: string, pattern: Peg,
  996. start = 0): bool {.noSideEffect, rtl, extern: "npegs$1".} =
  997. ## returns ``true`` if ``s`` matches the ``pattern`` beginning from ``start``.
  998. result = matchLen(s, pattern, start) != -1
  999. proc find*(s: string, pattern: Peg, matches: var openArray[string],
  1000. start = 0): int {.noSideEffect, rtl, extern: "npegs$1Capture".} =
  1001. ## returns the starting position of ``pattern`` in ``s`` and the captured
  1002. ## substrings in the array ``matches``. If it does not match, nothing
  1003. ## is written into ``matches`` and -1 is returned.
  1004. var c: Captures
  1005. c.origStart = start
  1006. for i in start .. s.len-1:
  1007. c.ml = 0
  1008. if rawMatch(s, pattern, i, c) >= 0:
  1009. fillMatches(s, matches, c)
  1010. return i
  1011. return -1
  1012. # could also use the pattern here: (!P .)* P
  1013. proc findBounds*(s: string, pattern: Peg, matches: var openArray[string],
  1014. start = 0): tuple[first, last: int] {.
  1015. noSideEffect, rtl, extern: "npegs$1Capture".} =
  1016. ## returns the starting position and end position of ``pattern`` in ``s``
  1017. ## and the captured
  1018. ## substrings in the array ``matches``. If it does not match, nothing
  1019. ## is written into ``matches`` and (-1,0) is returned.
  1020. var c: Captures
  1021. c.origStart = start
  1022. for i in start .. s.len-1:
  1023. c.ml = 0
  1024. var L = rawMatch(s, pattern, i, c)
  1025. if L >= 0:
  1026. fillMatches(s, matches, c)
  1027. return (i, i+L-1)
  1028. return (-1, 0)
  1029. proc find*(s: string, pattern: Peg,
  1030. start = 0): int {.noSideEffect, rtl, extern: "npegs$1".} =
  1031. ## returns the starting position of ``pattern`` in ``s``. If it does not
  1032. ## match, -1 is returned.
  1033. var c: Captures
  1034. c.origStart = start
  1035. for i in start .. s.len-1:
  1036. if rawMatch(s, pattern, i, c) >= 0: return i
  1037. return -1
  1038. iterator findAll*(s: string, pattern: Peg, start = 0): string =
  1039. ## yields all matching *substrings* of `s` that match `pattern`.
  1040. var c: Captures
  1041. c.origStart = start
  1042. var i = start
  1043. while i < s.len:
  1044. c.ml = 0
  1045. var L = rawMatch(s, pattern, i, c)
  1046. if L < 0:
  1047. inc(i, 1)
  1048. else:
  1049. yield substr(s, i, i+L-1)
  1050. inc(i, L)
  1051. proc findAll*(s: string, pattern: Peg, start = 0): seq[string] {.
  1052. noSideEffect, rtl, extern: "npegs$1".} =
  1053. ## returns all matching *substrings* of `s` that match `pattern`.
  1054. ## If it does not match, @[] is returned.
  1055. result = @[]
  1056. for it in findAll(s, pattern, start): result.add it
# When the `nimhygiene` symbol is not defined, declare `inject` as an empty
# user pragma so that the `{.inject.}` annotations in this module still
# compile.
when not defined(nimhygiene):
  {.pragma: inject.}
template `=~`*(s: string, pattern: Peg): bool =
  ## This calls ``match`` with an implicitly declared ``matches`` array that
  ## can be used in the scope of the ``=~`` call:
  ##
  ## .. code-block:: nim
  ##
  ##   if line =~ peg"\s* {\w+} \s* '=' \s* {\w+}":
  ##     # matches a key=value pair:
  ##     echo("Key: ", matches[0])
  ##     echo("Value: ", matches[1])
  ##   elif line =~ peg"\s*{'#'.*}":
  ##     # matches a comment
  ##     # note that the implicit ``matches`` array is different from the
  ##     # ``matches`` array of the first branch
  ##     echo("comment: ", matches[0])
  ##   else:
  ##     echo("syntax error")
  ##
  bind MaxSubpatterns
  # Only declare `matches` when the current scope does not already have one,
  # so that repeated uses in the same scope share a single array.
  when not declaredInScope(matches):
    var matches {.inject.}: array[0..MaxSubpatterns-1, string]
  match(s, pattern, matches)
  1081. # ------------------------- more string handling ------------------------------
  1082. proc contains*(s: string, pattern: Peg, start = 0): bool {.
  1083. noSideEffect, rtl, extern: "npegs$1".} =
  1084. ## same as ``find(s, pattern, start) >= 0``
  1085. return find(s, pattern, start) >= 0
  1086. proc contains*(s: string, pattern: Peg, matches: var openArray[string],
  1087. start = 0): bool {.noSideEffect, rtl, extern: "npegs$1Capture".} =
  1088. ## same as ``find(s, pattern, matches, start) >= 0``
  1089. return find(s, pattern, matches, start) >= 0
  1090. proc startsWith*(s: string, prefix: Peg, start = 0): bool {.
  1091. noSideEffect, rtl, extern: "npegs$1".} =
  1092. ## returns true if `s` starts with the pattern `prefix`
  1093. result = matchLen(s, prefix, start) >= 0
  1094. proc endsWith*(s: string, suffix: Peg, start = 0): bool {.
  1095. noSideEffect, rtl, extern: "npegs$1".} =
  1096. ## returns true if `s` ends with the pattern `suffix`
  1097. var c: Captures
  1098. c.origStart = start
  1099. for i in start .. s.len-1:
  1100. if rawMatch(s, suffix, i, c) == s.len - i: return true
  1101. proc replacef*(s: string, sub: Peg, by: string): string {.
  1102. noSideEffect, rtl, extern: "npegs$1".} =
  1103. ## Replaces `sub` in `s` by the string `by`. Captures can be accessed in `by`
  1104. ## with the notation ``$i`` and ``$#`` (see strutils.`%`). Examples:
  1105. ##
  1106. ## .. code-block:: nim
  1107. ## "var1=key; var2=key2".replacef(peg"{\ident}'='{\ident}", "$1<-$2$2")
  1108. ##
  1109. ## Results in:
  1110. ##
  1111. ## .. code-block:: nim
  1112. ##
  1113. ## "var1<-keykey; val2<-key2key2"
  1114. result = ""
  1115. var i = 0
  1116. var caps: array[0..MaxSubpatterns-1, string]
  1117. var c: Captures
  1118. while i < s.len:
  1119. c.ml = 0
  1120. var x = rawMatch(s, sub, i, c)
  1121. if x <= 0:
  1122. add(result, s[i])
  1123. inc(i)
  1124. else:
  1125. fillMatches(s, caps, c)
  1126. addf(result, by, caps)
  1127. inc(i, x)
  1128. add(result, substr(s, i))
  1129. proc replace*(s: string, sub: Peg, by = ""): string {.
  1130. noSideEffect, rtl, extern: "npegs$1".} =
  1131. ## Replaces `sub` in `s` by the string `by`. Captures cannot be accessed
  1132. ## in `by`.
  1133. result = ""
  1134. var i = 0
  1135. var c: Captures
  1136. while i < s.len:
  1137. var x = rawMatch(s, sub, i, c)
  1138. if x <= 0:
  1139. add(result, s[i])
  1140. inc(i)
  1141. else:
  1142. add(result, by)
  1143. inc(i, x)
  1144. add(result, substr(s, i))
  1145. proc parallelReplace*(s: string, subs: varargs[
  1146. tuple[pattern: Peg, repl: string]]): string {.
  1147. noSideEffect, rtl, extern: "npegs$1".} =
  1148. ## Returns a modified copy of `s` with the substitutions in `subs`
  1149. ## applied in parallel.
  1150. result = ""
  1151. var i = 0
  1152. var c: Captures
  1153. var caps: array[0..MaxSubpatterns-1, string]
  1154. while i < s.len:
  1155. block searchSubs:
  1156. for j in 0..high(subs):
  1157. c.ml = 0
  1158. var x = rawMatch(s, subs[j][0], i, c)
  1159. if x > 0:
  1160. fillMatches(s, caps, c)
  1161. addf(result, subs[j][1], caps)
  1162. inc(i, x)
  1163. break searchSubs
  1164. add(result, s[i])
  1165. inc(i)
  1166. # copy the rest:
  1167. add(result, substr(s, i))
  1168. proc replace*(s: string, sub: Peg, cb: proc(
  1169. match: int, cnt: int, caps: openArray[string]): string): string {.
  1170. rtl, extern: "npegs$1cb".} =
  1171. ## Replaces `sub` in `s` by the resulting strings from the callback.
  1172. ## The callback proc receives the index of the current match (starting with 0),
  1173. ## the count of captures and an open array with the captures of each match. Examples:
  1174. ##
  1175. ## .. code-block:: nim
  1176. ##
  1177. ## proc handleMatches*(m: int, n: int, c: openArray[string]): string =
  1178. ## result = ""
  1179. ## if m > 0:
  1180. ## result.add ", "
  1181. ## result.add case n:
  1182. ## of 2: c[0].toLower & ": '" & c[1] & "'"
  1183. ## of 1: c[0].toLower & ": ''"
  1184. ## else: ""
  1185. ##
  1186. ## let s = "Var1=key1;var2=Key2; VAR3"
  1187. ## echo s.replace(peg"{\ident}('='{\ident})* ';'* \s*", handleMatches)
  1188. ##
  1189. ## Results in:
  1190. ##
  1191. ## .. code-block:: nim
  1192. ##
  1193. ## "var1: 'key1', var2: 'Key2', var3: ''"
  1194. result = ""
  1195. var i = 0
  1196. var caps: array[0..MaxSubpatterns-1, string]
  1197. var c: Captures
  1198. var m = 0
  1199. while i < s.len:
  1200. c.ml = 0
  1201. var x = rawMatch(s, sub, i, c)
  1202. if x <= 0:
  1203. add(result, s[i])
  1204. inc(i)
  1205. else:
  1206. fillMatches(s, caps, c)
  1207. add(result, cb(m, c.ml, caps))
  1208. inc(i, x)
  1209. inc(m)
  1210. add(result, substr(s, i))
  1211. when not defined(js):
  1212. proc transformFile*(infile, outfile: string,
  1213. subs: varargs[tuple[pattern: Peg, repl: string]]) {.
  1214. rtl, extern: "npegs$1".} =
  1215. ## reads in the file `infile`, performs a parallel replacement (calls
  1216. ## `parallelReplace`) and writes back to `outfile`. Raises ``IOError`` if an
  1217. ## error occurs. This is supposed to be used for quick scripting.
  1218. ##
  1219. ## **Note**: this proc does not exist while using the JS backend.
  1220. var x = readFile(infile).string
  1221. writeFile(outfile, x.parallelReplace(subs))
  1222. iterator split*(s: string, sep: Peg): string =
  1223. ## Splits the string `s` into substrings.
  1224. ##
  1225. ## Substrings are separated by the PEG `sep`.
  1226. ## Examples:
  1227. ##
  1228. ## .. code-block:: nim
  1229. ## for word in split("00232this02939is39an22example111", peg"\d+"):
  1230. ## writeLine(stdout, word)
  1231. ##
  1232. ## Results in:
  1233. ##
  1234. ## .. code-block:: nim
  1235. ## "this"
  1236. ## "is"
  1237. ## "an"
  1238. ## "example"
  1239. ##
  1240. var c: Captures
  1241. var
  1242. first = 0
  1243. last = 0
  1244. while last < len(s):
  1245. c.ml = 0
  1246. var x = rawMatch(s, sep, last, c)
  1247. if x > 0: inc(last, x)
  1248. first = last
  1249. while last < len(s):
  1250. inc(last)
  1251. c.ml = 0
  1252. x = rawMatch(s, sep, last, c)
  1253. if x > 0: break
  1254. if first < last:
  1255. yield substr(s, first, last-1)
  1256. proc split*(s: string, sep: Peg): seq[string] {.
  1257. noSideEffect, rtl, extern: "npegs$1".} =
  1258. ## Splits the string `s` into substrings.
  1259. result = @[]
  1260. for it in split(s, sep): result.add it
  1261. # ------------------- scanner -------------------------------------------------
type
  Modifier = enum     ## modifier attached to a token; the lexer resets it
                      ## to `modNone` at the start of every token
    modNone,
    modVerbatim,
    modIgnoreCase,
    modIgnoreStyle
  TokKind = enum      ## enumeration of all tokens
    tkInvalid,        ## invalid token
    tkEof,            ## end of file reached
    tkAny,            ## .
    tkAnyRune,        ## _
    tkIdentifier,     ## abc
    tkStringLit,      ## "abc" or 'abc'
    tkCharSet,        ## [^A-Z]
    tkParLe,          ## '('
    tkParRi,          ## ')'
    tkCurlyLe,        ## '{'
    tkCurlyRi,        ## '}'
    tkCurlyAt,        ## '{@}'
    tkArrow,          ## '<-'
    tkBar,            ## '/'
    tkStar,           ## '*'
    tkPlus,           ## '+'
    tkAmp,            ## '&'
    tkNot,            ## '!'
    tkOption,         ## '?'
    tkAt,             ## '@'
    tkBuiltin,        ## \identifier
    tkEscaped,        ## \\
    tkBackref,        ## '$' directly followed by decimal digits, e.g. '$1'
    tkDollar,         ## '$' not followed by a digit
    tkHat             ## '^'

  Token {.final.} = object ## a token
    kind: TokKind          ## the type of the token
    modifier: Modifier
    literal: string        ## the parsed (string) literal
    charset: set[char]     ## if kind == tkCharSet
    index: int             ## if kind == tkBackref

  PegLexer {.inheritable.} = object ## the lexer object.
    bufpos: int               ## the current position within the buffer
    buf: string               ## the buffer itself
    lineNumber: int           ## the current line number
    lineStart: int            ## index of last line start in buffer
    colOffset: int            ## column to add
    filename: string
const
  # Display string for every `TokKind`, indexed by the enum value; the
  # entries are listed in the declaration order of `TokKind`.
  tokKindToStr: array[TokKind, string] = [
    "invalid", "[EOF]", ".", "_", "identifier", "string literal",
    "character set", "(", ")", "{", "}", "{@}",
    "<-", "/", "*", "+", "&", "!", "?",
    "@", "built-in", "escaped", "$", "$", "^"
  ]
  1314. proc handleCR(L: var PegLexer, pos: int): int =
  1315. assert(L.buf[pos] == '\c')
  1316. inc(L.lineNumber)
  1317. result = pos+1
  1318. if result < L.buf.len and L.buf[result] == '\L': inc(result)
  1319. L.lineStart = result
  1320. proc handleLF(L: var PegLexer, pos: int): int =
  1321. assert(L.buf[pos] == '\L')
  1322. inc(L.lineNumber)
  1323. result = pos+1
  1324. L.lineStart = result
  1325. proc init(L: var PegLexer, input, filename: string, line = 1, col = 0) =
  1326. L.buf = input
  1327. L.bufpos = 0
  1328. L.lineNumber = line
  1329. L.colOffset = col
  1330. L.lineStart = 0
  1331. L.filename = filename
  1332. proc getColumn(L: PegLexer): int {.inline.} =
  1333. result = abs(L.bufpos - L.lineStart) + L.colOffset
  1334. proc getLine(L: PegLexer): int {.inline.} =
  1335. result = L.lineNumber
  1336. proc errorStr(L: PegLexer, msg: string, line = -1, col = -1): string =
  1337. var line = if line < 0: getLine(L) else: line
  1338. var col = if col < 0: getColumn(L) else: col
  1339. result = "$1($2, $3) Error: $4" % [L.filename, $line, $col, msg]
  1340. proc handleHexChar(c: var PegLexer, xi: var int) =
  1341. case c.buf[c.bufpos]
  1342. of '0'..'9':
  1343. xi = (xi shl 4) or (ord(c.buf[c.bufpos]) - ord('0'))
  1344. inc(c.bufpos)
  1345. of 'a'..'f':
  1346. xi = (xi shl 4) or (ord(c.buf[c.bufpos]) - ord('a') + 10)
  1347. inc(c.bufpos)
  1348. of 'A'..'F':
  1349. xi = (xi shl 4) or (ord(c.buf[c.bufpos]) - ord('A') + 10)
  1350. inc(c.bufpos)
  1351. else: discard
  1352. proc getEscapedChar(c: var PegLexer, tok: var Token) =
  1353. inc(c.bufpos)
  1354. if c.bufpos >= len(c.buf):
  1355. tok.kind = tkInvalid
  1356. return
  1357. case c.buf[c.bufpos]
  1358. of 'r', 'R', 'c', 'C':
  1359. add(tok.literal, '\c')
  1360. inc(c.bufpos)
  1361. of 'l', 'L':
  1362. add(tok.literal, '\L')
  1363. inc(c.bufpos)
  1364. of 'f', 'F':
  1365. add(tok.literal, '\f')
  1366. inc(c.bufpos)
  1367. of 'e', 'E':
  1368. add(tok.literal, '\e')
  1369. inc(c.bufpos)
  1370. of 'a', 'A':
  1371. add(tok.literal, '\a')
  1372. inc(c.bufpos)
  1373. of 'b', 'B':
  1374. add(tok.literal, '\b')
  1375. inc(c.bufpos)
  1376. of 'v', 'V':
  1377. add(tok.literal, '\v')
  1378. inc(c.bufpos)
  1379. of 't', 'T':
  1380. add(tok.literal, '\t')
  1381. inc(c.bufpos)
  1382. of 'x', 'X':
  1383. inc(c.bufpos)
  1384. if c.bufpos >= len(c.buf):
  1385. tok.kind = tkInvalid
  1386. return
  1387. var xi = 0
  1388. handleHexChar(c, xi)
  1389. handleHexChar(c, xi)
  1390. if xi == 0: tok.kind = tkInvalid
  1391. else: add(tok.literal, chr(xi))
  1392. of '0'..'9':
  1393. var val = ord(c.buf[c.bufpos]) - ord('0')
  1394. inc(c.bufpos)
  1395. var i = 1
  1396. while (c.bufpos < len(c.buf)) and (i <= 3) and (c.buf[c.bufpos] in {'0'..'9'}):
  1397. val = val * 10 + ord(c.buf[c.bufpos]) - ord('0')
  1398. inc(c.bufpos)
  1399. inc(i)
  1400. if val > 0 and val <= 255: add(tok.literal, chr(val))
  1401. else: tok.kind = tkInvalid
  1402. of '\0'..'\31':
  1403. tok.kind = tkInvalid
  1404. elif c.buf[c.bufpos] in strutils.Letters:
  1405. tok.kind = tkInvalid
  1406. else:
  1407. add(tok.literal, c.buf[c.bufpos])
  1408. inc(c.bufpos)
  1409. proc skip(c: var PegLexer) =
  1410. var pos = c.bufpos
  1411. while pos < c.buf.len:
  1412. case c.buf[pos]
  1413. of ' ', '\t':
  1414. inc(pos)
  1415. of '#':
  1416. while (pos < c.buf.len) and
  1417. not (c.buf[pos] in {'\c', '\L', '\0'}): inc(pos)
  1418. of '\c':
  1419. pos = handleCR(c, pos)
  1420. of '\L':
  1421. pos = handleLF(c, pos)
  1422. else:
  1423. break # EndOfFile also leaves the loop
  1424. c.bufpos = pos
  1425. proc getString(c: var PegLexer, tok: var Token) =
  1426. tok.kind = tkStringLit
  1427. var pos = c.bufpos + 1
  1428. var quote = c.buf[pos-1]
  1429. while pos < c.buf.len:
  1430. case c.buf[pos]
  1431. of '\\':
  1432. c.bufpos = pos
  1433. getEscapedChar(c, tok)
  1434. pos = c.bufpos
  1435. of '\c', '\L', '\0':
  1436. tok.kind = tkInvalid
  1437. break
  1438. elif c.buf[pos] == quote:
  1439. inc(pos)
  1440. break
  1441. else:
  1442. add(tok.literal, c.buf[pos])
  1443. inc(pos)
  1444. c.bufpos = pos
  1445. proc getDollar(c: var PegLexer, tok: var Token) =
  1446. var pos = c.bufpos + 1
  1447. if pos < c.buf.len and c.buf[pos] in {'0'..'9'}:
  1448. tok.kind = tkBackref
  1449. tok.index = 0
  1450. while pos < c.buf.len and c.buf[pos] in {'0'..'9'}:
  1451. tok.index = tok.index * 10 + ord(c.buf[pos]) - ord('0')
  1452. inc(pos)
  1453. else:
  1454. tok.kind = tkDollar
  1455. c.bufpos = pos
  1456. proc getCharSet(c: var PegLexer, tok: var Token) =
  1457. tok.kind = tkCharSet
  1458. tok.charset = {}
  1459. var pos = c.bufpos + 1
  1460. var caret = false
  1461. if pos < c.buf.len:
  1462. if c.buf[pos] == '^':
  1463. inc(pos)
  1464. caret = true
  1465. while pos < c.buf.len:
  1466. var ch: char
  1467. case c.buf[pos]
  1468. of ']':
  1469. if pos < c.buf.len: inc(pos)
  1470. break
  1471. of '\\':
  1472. c.bufpos = pos
  1473. getEscapedChar(c, tok)
  1474. pos = c.bufpos
  1475. ch = tok.literal[tok.literal.len-1]
  1476. of '\C', '\L', '\0':
  1477. tok.kind = tkInvalid
  1478. break
  1479. else:
  1480. ch = c.buf[pos]
  1481. inc(pos)
  1482. incl(tok.charset, ch)
  1483. if c.buf[pos] == '-':
  1484. if pos+1 < c.buf.len and c.buf[pos+1] == ']':
  1485. incl(tok.charset, '-')
  1486. inc(pos)
  1487. else:
  1488. if pos+1 < c.buf.len:
  1489. inc(pos)
  1490. else:
  1491. break
  1492. var ch2: char
  1493. case c.buf[pos]
  1494. of '\\':
  1495. c.bufpos = pos
  1496. getEscapedChar(c, tok)
  1497. pos = c.bufpos
  1498. ch2 = tok.literal[tok.literal.len-1]
  1499. of '\C', '\L', '\0':
  1500. tok.kind = tkInvalid
  1501. break
  1502. else:
  1503. if pos+1 < c.buf.len:
  1504. ch2 = c.buf[pos]
  1505. inc(pos)
  1506. else:
  1507. break
  1508. for i in ord(ch)+1 .. ord(ch2):
  1509. incl(tok.charset, chr(i))
  1510. c.bufpos = pos
  1511. if caret: tok.charset = {'\1'..'\xFF'} - tok.charset
  1512. proc getSymbol(c: var PegLexer, tok: var Token) =
  1513. var pos = c.bufpos
  1514. while pos < c.buf.len:
  1515. add(tok.literal, c.buf[pos])
  1516. inc(pos)
  1517. if pos < c.buf.len and c.buf[pos] notin strutils.IdentChars: break
  1518. c.bufpos = pos
  1519. tok.kind = tkIdentifier
  1520. proc getBuiltin(c: var PegLexer, tok: var Token) =
  1521. if c.bufpos+1 < c.buf.len and c.buf[c.bufpos+1] in strutils.Letters:
  1522. inc(c.bufpos)
  1523. getSymbol(c, tok)
  1524. tok.kind = tkBuiltin
  1525. else:
  1526. tok.kind = tkEscaped
  1527. getEscapedChar(c, tok) # may set tok.kind to tkInvalid
  1528. proc getTok(c: var PegLexer, tok: var Token) =
  1529. tok.kind = tkInvalid
  1530. tok.modifier = modNone
  1531. setLen(tok.literal, 0)
  1532. skip(c)
  1533. if c.bufpos >= c.buf.len:
  1534. tok.kind = tkEof
  1535. tok.literal = "[EOF]"
  1536. add(tok.literal, '\0')
  1537. inc(c.bufpos)
  1538. return
  1539. case c.buf[c.bufpos]
  1540. of '{':
  1541. inc(c.bufpos)
  1542. if c.buf[c.bufpos] == '@' and c.bufpos+2 < c.buf.len and
  1543. c.buf[c.bufpos+1] == '}':
  1544. tok.kind = tkCurlyAt
  1545. inc(c.bufpos, 2)
  1546. add(tok.literal, "{@}")
  1547. else:
  1548. tok.kind = tkCurlyLe
  1549. add(tok.literal, '{')
  1550. of '}':
  1551. tok.kind = tkCurlyRi
  1552. inc(c.bufpos)
  1553. add(tok.literal, '}')
  1554. of '[':
  1555. getCharSet(c, tok)
  1556. of '(':
  1557. tok.kind = tkParLe
  1558. inc(c.bufpos)
  1559. add(tok.literal, '(')
  1560. of ')':
  1561. tok.kind = tkParRi
  1562. inc(c.bufpos)
  1563. add(tok.literal, ')')
  1564. of '.':
  1565. tok.kind = tkAny
  1566. inc(c.bufpos)
  1567. add(tok.literal, '.')
  1568. of '_':
  1569. tok.kind = tkAnyRune
  1570. inc(c.bufpos)
  1571. add(tok.literal, '_')
  1572. of '\\':
  1573. getBuiltin(c, tok)
  1574. of '\'', '"': getString(c, tok)
  1575. of '$': getDollar(c, tok)
  1576. of 'a'..'z', 'A'..'Z', '\128'..'\255':
  1577. getSymbol(c, tok)
  1578. if c.bufpos >= c.buf.len:
  1579. return
  1580. if c.buf[c.bufpos] in {'\'', '"'} or
  1581. c.buf[c.bufpos] == '$' and c.bufpos+1 < c.buf.len and
  1582. c.buf[c.bufpos+1] in {'0'..'9'}:
  1583. case tok.literal
  1584. of "i": tok.modifier = modIgnoreCase
  1585. of "y": tok.modifier = modIgnoreStyle
  1586. of "v": tok.modifier = modVerbatim
  1587. else: discard
  1588. setLen(tok.literal, 0)
  1589. if c.buf[c.bufpos] == '$':
  1590. getDollar(c, tok)
  1591. else:
  1592. getString(c, tok)
  1593. if tok.modifier == modNone: tok.kind = tkInvalid
  1594. of '+':
  1595. tok.kind = tkPlus
  1596. inc(c.bufpos)
  1597. add(tok.literal, '+')
  1598. of '*':
  1599. tok.kind = tkStar
  1600. inc(c.bufpos)
  1601. add(tok.literal, '+')
  1602. of '<':
  1603. if c.bufpos+2 < c.buf.len and c.buf[c.bufpos+1] == '-':
  1604. inc(c.bufpos, 2)
  1605. tok.kind = tkArrow
  1606. add(tok.literal, "<-")
  1607. else:
  1608. add(tok.literal, '<')
  1609. of '/':
  1610. tok.kind = tkBar
  1611. inc(c.bufpos)
  1612. add(tok.literal, '/')
  1613. of '?':
  1614. tok.kind = tkOption
  1615. inc(c.bufpos)
  1616. add(tok.literal, '?')
  1617. of '!':
  1618. tok.kind = tkNot
  1619. inc(c.bufpos)
  1620. add(tok.literal, '!')
  1621. of '&':
  1622. tok.kind = tkAmp
  1623. inc(c.bufpos)
  1624. add(tok.literal, '!')
  1625. of '@':
  1626. tok.kind = tkAt
  1627. inc(c.bufpos)
  1628. add(tok.literal, '@')
  1629. if c.buf[c.bufpos] == '@':
  1630. tok.kind = tkCurlyAt
  1631. inc(c.bufpos)
  1632. add(tok.literal, '@')
  1633. of '^':
  1634. tok.kind = tkHat
  1635. inc(c.bufpos)
  1636. add(tok.literal, '^')
  1637. else:
  1638. if c.bufpos >= c.buf.len:
  1639. tok.kind = tkEof
  1640. tok.literal = "[EOF]"
  1641. add(tok.literal, c.buf[c.bufpos])
  1642. inc(c.bufpos)
  1643. proc arrowIsNextTok(c: PegLexer): bool =
  1644. # the only look ahead we need
  1645. var pos = c.bufpos
  1646. while pos < c.buf.len and c.buf[pos] in {'\t', ' '}: inc(pos)
  1647. if pos+1 >= c.buf.len:
  1648. return
  1649. result = c.buf[pos] == '<' and c.buf[pos+1] == '-'
  1650. # ----------------------------- parser ----------------------------------------
type
  EInvalidPeg* = object of ValueError ## raised if an invalid
                                      ## PEG has been detected
  PegParser = object of PegLexer ## the PEG parser object
    tok: Token                   ## current token, refilled by `getTok`
    nonterms: seq[NonTerminal]   ## nonterminals seen so far (declared or
                                 ## only referenced)
    modifier: Modifier           ## default modifier, set by a leading
                                 ## `\i` or `\y` in the pattern
    captures: int                ## number of `{...}` captures parsed so far;
                                 ## used to validate back references
    identIsVerbatim: bool        ## true when the pattern is a plain expression
                                 ## (not a grammar), so identifiers are
                                 ## matched as terminals
    skip: Peg                    ## pattern set by `\skip`; `token` prepends it
                                 ## to terminals when it is not empty
  1661. proc pegError(p: PegParser, msg: string, line = -1, col = -1) =
  1662. var e: ref EInvalidPeg
  1663. new(e)
  1664. e.msg = errorStr(p, msg, line, col)
  1665. raise e
  1666. proc getTok(p: var PegParser) =
  1667. getTok(p, p.tok)
  1668. if p.tok.kind == tkInvalid: pegError(p, "'" & p.tok.literal & "' is invalid token")
  1669. proc eat(p: var PegParser, kind: TokKind) =
  1670. if p.tok.kind == kind: getTok(p)
  1671. else: pegError(p, tokKindToStr[kind] & " expected")
# Forward declaration: `primary` calls `parseExpr` for parenthesized and
# captured sub-expressions before `parseExpr` itself is defined.
proc parseExpr(p: var PegParser): Peg {.gcsafe.}
  1673. proc getNonTerminal(p: var PegParser, name: string): NonTerminal =
  1674. for i in 0..high(p.nonterms):
  1675. result = p.nonterms[i]
  1676. if cmpIgnoreStyle(result.name, name) == 0: return
  1677. # forward reference:
  1678. result = newNonTerminal(name, getLine(p), getColumn(p))
  1679. add(p.nonterms, result)
  1680. proc modifiedTerm(s: string, m: Modifier): Peg =
  1681. case m
  1682. of modNone, modVerbatim: result = term(s)
  1683. of modIgnoreCase: result = termIgnoreCase(s)
  1684. of modIgnoreStyle: result = termIgnoreStyle(s)
  1685. proc modifiedBackref(s: int, m: Modifier): Peg =
  1686. case m
  1687. of modNone, modVerbatim: result = backref(s)
  1688. of modIgnoreCase: result = backrefIgnoreCase(s)
  1689. of modIgnoreStyle: result = backrefIgnoreStyle(s)
  1690. proc builtin(p: var PegParser): Peg =
  1691. # do not use "y", "skip" or "i" as these would be ambiguous
  1692. case p.tok.literal
  1693. of "n": result = newLine()
  1694. of "d": result = charSet({'0'..'9'})
  1695. of "D": result = charSet({'\1'..'\xff'} - {'0'..'9'})
  1696. of "s": result = charSet({' ', '\9'..'\13'})
  1697. of "S": result = charSet({'\1'..'\xff'} - {' ', '\9'..'\13'})
  1698. of "w": result = charSet({'a'..'z', 'A'..'Z', '_', '0'..'9'})
  1699. of "W": result = charSet({'\1'..'\xff'} - {'a'..'z', 'A'..'Z', '_', '0'..'9'})
  1700. of "a": result = charSet({'a'..'z', 'A'..'Z'})
  1701. of "A": result = charSet({'\1'..'\xff'} - {'a'..'z', 'A'..'Z'})
  1702. of "ident": result = pegs.ident
  1703. of "letter": result = unicodeLetter()
  1704. of "upper": result = unicodeUpper()
  1705. of "lower": result = unicodeLower()
  1706. of "title": result = unicodeTitle()
  1707. of "white": result = unicodeWhitespace()
  1708. else: pegError(p, "unknown built-in: " & p.tok.literal)
  1709. proc token(terminal: Peg, p: PegParser): Peg =
  1710. if p.skip.kind == pkEmpty: result = terminal
  1711. else: result = sequence(p.skip, terminal)
  1712. proc primary(p: var PegParser): Peg =
  1713. case p.tok.kind
  1714. of tkAmp:
  1715. getTok(p)
  1716. return &primary(p)
  1717. of tkNot:
  1718. getTok(p)
  1719. return !primary(p)
  1720. of tkAt:
  1721. getTok(p)
  1722. return !*primary(p)
  1723. of tkCurlyAt:
  1724. getTok(p)
  1725. return !*\primary(p).token(p)
  1726. else: discard
  1727. case p.tok.kind
  1728. of tkIdentifier:
  1729. if p.identIsVerbatim:
  1730. var m = p.tok.modifier
  1731. if m == modNone: m = p.modifier
  1732. result = modifiedTerm(p.tok.literal, m).token(p)
  1733. getTok(p)
  1734. elif not arrowIsNextTok(p):
  1735. var nt = getNonTerminal(p, p.tok.literal)
  1736. incl(nt.flags, ntUsed)
  1737. result = nonterminal(nt).token(p)
  1738. getTok(p)
  1739. else:
  1740. pegError(p, "expression expected, but found: " & p.tok.literal)
  1741. of tkStringLit:
  1742. var m = p.tok.modifier
  1743. if m == modNone: m = p.modifier
  1744. result = modifiedTerm(p.tok.literal, m).token(p)
  1745. getTok(p)
  1746. of tkCharSet:
  1747. if '\0' in p.tok.charset:
  1748. pegError(p, "binary zero ('\\0') not allowed in character class")
  1749. result = charSet(p.tok.charset).token(p)
  1750. getTok(p)
  1751. of tkParLe:
  1752. getTok(p)
  1753. result = parseExpr(p)
  1754. eat(p, tkParRi)
  1755. of tkCurlyLe:
  1756. getTok(p)
  1757. result = capture(parseExpr(p)).token(p)
  1758. eat(p, tkCurlyRi)
  1759. inc(p.captures)
  1760. of tkAny:
  1761. result = any().token(p)
  1762. getTok(p)
  1763. of tkAnyRune:
  1764. result = anyRune().token(p)
  1765. getTok(p)
  1766. of tkBuiltin:
  1767. result = builtin(p).token(p)
  1768. getTok(p)
  1769. of tkEscaped:
  1770. result = term(p.tok.literal[0]).token(p)
  1771. getTok(p)
  1772. of tkDollar:
  1773. result = endAnchor()
  1774. getTok(p)
  1775. of tkHat:
  1776. result = startAnchor()
  1777. getTok(p)
  1778. of tkBackref:
  1779. var m = p.tok.modifier
  1780. if m == modNone: m = p.modifier
  1781. result = modifiedBackref(p.tok.index, m).token(p)
  1782. if p.tok.index < 0 or p.tok.index > p.captures:
  1783. pegError(p, "invalid back reference index: " & $p.tok.index)
  1784. getTok(p)
  1785. else:
  1786. pegError(p, "expression expected, but found: " & p.tok.literal)
  1787. getTok(p) # we must consume a token here to prevent endless loops!
  1788. while true:
  1789. case p.tok.kind
  1790. of tkOption:
  1791. result = ?result
  1792. getTok(p)
  1793. of tkStar:
  1794. result = *result
  1795. getTok(p)
  1796. of tkPlus:
  1797. result = +result
  1798. getTok(p)
  1799. else: break
  1800. proc seqExpr(p: var PegParser): Peg =
  1801. result = primary(p)
  1802. while true:
  1803. case p.tok.kind
  1804. of tkAmp, tkNot, tkAt, tkStringLit, tkCharSet, tkParLe, tkCurlyLe,
  1805. tkAny, tkAnyRune, tkBuiltin, tkEscaped, tkDollar, tkBackref,
  1806. tkHat, tkCurlyAt:
  1807. result = sequence(result, primary(p))
  1808. of tkIdentifier:
  1809. if not arrowIsNextTok(p):
  1810. result = sequence(result, primary(p))
  1811. else: break
  1812. else: break
  1813. proc parseExpr(p: var PegParser): Peg =
  1814. result = seqExpr(p)
  1815. while p.tok.kind == tkBar:
  1816. getTok(p)
  1817. result = result / seqExpr(p)
  1818. proc parseRule(p: var PegParser): NonTerminal =
  1819. if p.tok.kind == tkIdentifier and arrowIsNextTok(p):
  1820. result = getNonTerminal(p, p.tok.literal)
  1821. if ntDeclared in result.flags:
  1822. pegError(p, "attempt to redefine: " & result.name)
  1823. result.line = getLine(p)
  1824. result.col = getColumn(p)
  1825. getTok(p)
  1826. eat(p, tkArrow)
  1827. result.rule = parseExpr(p)
  1828. incl(result.flags, ntDeclared) # NOW inlining may be attempted
  1829. else:
  1830. pegError(p, "rule expected, but found: " & p.tok.literal)
  1831. proc rawParse(p: var PegParser): Peg =
  1832. ## parses a rule or a PEG expression
  1833. while p.tok.kind == tkBuiltin:
  1834. case p.tok.literal
  1835. of "i":
  1836. p.modifier = modIgnoreCase
  1837. getTok(p)
  1838. of "y":
  1839. p.modifier = modIgnoreStyle
  1840. getTok(p)
  1841. of "skip":
  1842. getTok(p)
  1843. p.skip = ?primary(p)
  1844. else: break
  1845. if p.tok.kind == tkIdentifier and arrowIsNextTok(p):
  1846. result = parseRule(p).rule
  1847. while p.tok.kind != tkEof:
  1848. discard parseRule(p)
  1849. else:
  1850. p.identIsVerbatim = true
  1851. result = parseExpr(p)
  1852. if p.tok.kind != tkEof:
  1853. pegError(p, "EOF expected, but found: " & p.tok.literal)
  1854. for i in 0..high(p.nonterms):
  1855. var nt = p.nonterms[i]
  1856. if ntDeclared notin nt.flags:
  1857. pegError(p, "undeclared identifier: " & nt.name, nt.line, nt.col)
  1858. elif ntUsed notin nt.flags and i > 0:
  1859. pegError(p, "unused rule: " & nt.name, nt.line, nt.col)
  1860. proc parsePeg*(pattern: string, filename = "pattern", line = 1, col = 0): Peg =
  1861. ## constructs a Peg object from `pattern`. `filename`, `line`, `col` are
  1862. ## used for error messages, but they only provide start offsets. `parsePeg`
  1863. ## keeps track of line and column numbers within `pattern`.
  1864. var p: PegParser
  1865. init(PegLexer(p), pattern, filename, line, col)
  1866. p.tok.kind = tkInvalid
  1867. p.tok.modifier = modNone
  1868. p.tok.literal = ""
  1869. p.tok.charset = {}
  1870. p.nonterms = @[]
  1871. p.identIsVerbatim = false
  1872. getTok(p)
  1873. result = rawParse(p)
  1874. proc peg*(pattern: string): Peg =
  1875. ## constructs a Peg object from the `pattern`. The short name has been
  1876. ## chosen to encourage its use as a raw string modifier::
  1877. ##
  1878. ## peg"{\ident} \s* '=' \s* {.*}"
  1879. result = parsePeg(pattern, "pattern")
  1880. proc escapePeg*(s: string): string =
  1881. ## escapes `s` so that it is matched verbatim when used as a peg.
  1882. result = ""
  1883. var inQuote = false
  1884. for c in items(s):
  1885. case c
  1886. of '\0'..'\31', '\'', '"', '\\':
  1887. if inQuote:
  1888. result.add('\'')
  1889. inQuote = false
  1890. result.add("\\x")
  1891. result.add(toHex(ord(c), 2))
  1892. else:
  1893. if not inQuote:
  1894. result.add('\'')
  1895. inQuote = true
  1896. result.add(c)
  1897. if inQuote: result.add('\'')
when isMainModule:
  # Self-test: runs once at run time and once at compile time (`static`)
  # when this module is compiled as the main module.
  proc pegsTest() =
    # escaping and basic matching with modifiers
    assert escapePeg("abc''def'") == r"'abc'\x27\x27'def'\x27"
    assert match("(a b c)", peg"'(' @ ')'")
    assert match("W_HI_Le", peg"\y 'while'")
    assert(not match("W_HI_L", peg"\y 'while'"))
    assert(not match("W_HI_Le", peg"\y v'while'"))
    assert match("W_HI_Le", peg"y'while'")

    # built-in character classes
    assert($ +digits == $peg"\d+")
    assert "0158787".match(peg"\d+")
    assert "ABC 0232".match(peg"\w+\s+\d+")
    assert "ABC".match(peg"\d+ / \w+")

    var accum: seq[string] = @[]
    for word in split("00232this02939is39an22example111", peg"\d+"):
      accum.add(word)
    assert(accum == @["this", "is", "an", "example"])

    assert matchLen("key", ident) == 3

    var pattern = sequence(ident, *whitespace, term('='), *whitespace, ident)
    assert matchLen("key1= cal9", pattern) == 11

    # hand-built recursive grammar with captures
    var ws = newNonTerminal("ws", 1, 1)
    ws.rule = *whitespace

    var expr = newNonTerminal("expr", 1, 1)
    expr.rule = sequence(capture(ident), *sequence(
                  nonterminal(ws), term('+'), nonterminal(ws), nonterminal(expr)))

    var c: Captures
    var s = "a+b + c +d+e+f"
    assert rawMatch(s, expr.rule, 0, c) == len(s)
    var a = ""
    for i in 0..c.ml-1:
      a.add(substr(s, c.matches[i][0], c.matches[i][1]))
    assert a == "abcdef"
    #echo expr.rule

    #const filename = "lib/devel/peg/grammar.txt"
    #var grammar = parsePeg(newFileStream(filename, fmRead), filename)
    #echo "a <- [abc]*?".match(grammar)
    assert find("_____abc_______", term("abc"), 2) == 5
    assert match("_______ana", peg"A <- 'ana' / . A")
    assert match("abcs%%%", peg"A <- ..A / .A / '%'")

    var matches: array[0..MaxSubpatterns-1, string]
    if "abc" =~ peg"{'a'}'bc' 'xyz' / {\ident}":
      assert matches[0] == "abc"
    else:
      assert false

    # multi-rule grammar written as one pattern string
    var g2 = peg"""S <- A B / C D
                   A <- 'a'+
                   B <- 'b'+
                   C <- 'c'+
                   D <- 'd'+
                """
    assert($g2 == "((A B) / (C D))")
    assert match("cccccdddddd", g2)
    assert("var1=key; var2=key2".replacef(peg"{\ident}'='{\ident}", "$1<-$2$2") ==
           "var1<-keykey; var2<-key2key2")
    assert("var1=key; var2=key2".replace(peg"{\ident}'='{\ident}", "$1<-$2$2") ==
           "$1<-$2$2; $1<-$2$2")
    assert "var1=key; var2=key2".endsWith(peg"{\ident}'='{\ident}")

    if "aaaaaa" =~ peg"'aa' !. / ({'a'})+":
      assert matches[0] == "a"
    else:
      assert false

    if match("abcdefg", peg"c {d} ef {g}", matches, 2):
      assert matches[0] == "d"
      assert matches[1] == "g"
    else:
      assert false

    accum = @[]
    for x in findAll("abcdef", peg".", 3):
      accum.add(x)
    assert(accum == @["d", "e", "f"])

    for x in findAll("abcdef", peg"^{.}", 3):
      assert x == "d"

    if "f(a, b)" =~ peg"{[0-9]+} / ({\ident} '(' {@} ')')":
      assert matches[0] == "f"
      assert matches[1] == "a, b"
    else:
      assert false

    # Unicode-aware built-ins
    assert match("eine übersicht und außerdem", peg"(\letter \white*)+")
    # NOTE(review): 'ß' is apparently not classified as a lower-case letter,
    # which is why the input of the next test omits it — confirm.
    assert match("eine übersicht und auerdem", peg"(\lower \white*)+")
    assert match("EINE ÜBERSICHT UND AUSSERDEM", peg"(\upper \white*)+")
    assert(not match("456678", peg"(\letter)+"))

    assert("var1 = key; var2 = key2".replacef(
      peg"\skip(\s*) {\ident}'='{\ident}", "$1<-$2$2") ==
           "var1<-keykey;var2<-key2key2")

    assert match("prefix/start", peg"^start$", 7)

    # captures that match nothing or the empty string
    if "foo" =~ peg"{'a'}?.*":
      assert matches[0].len == 0
    else: assert false

    if "foo" =~ peg"{''}.*":
      assert matches[0] == ""
    else: assert false

    if "foo" =~ peg"{'foo'}":
      assert matches[0] == "foo"
    else: assert false

    let empty_test = peg"^\d*"
    let str = "XYZ"

    assert(str.find(empty_test) == 0)
    assert(str.match(empty_test))

    # callback-based replace: `m` is the index of the match, `n` the number
    # of captures and `c` the captured strings
    proc handleMatches(m: int, n: int, c: openArray[string]): string =
      result = ""
      if m > 0:
        result.add ", "
      result.add case n:
        of 2: toLowerAscii(c[0]) & ": '" & c[1] & "'"
        of 1: toLowerAscii(c[0]) & ": ''"
        else: ""

    assert("Var1=key1;var2=Key2; VAR3".
           replace(peg"{\ident}('='{\ident})* ';'* \s*",
           handleMatches) == "var1: 'key1', var2: 'Key2', var3: ''")

    doAssert "test1".match(peg"""{@}$""")
    doAssert "test2".match(peg"""{(!$ .)*} $""")
  pegsTest()
  static:
    pegsTest()