pegs.nim 64 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2012 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## Simple PEG (Parsing expression grammar) matching. Uses no memorization, but
  10. ## uses superoperators and symbol inlining to improve performance. Note:
  11. ## Matching performance is hopefully competitive with optimized regular
  12. ## expression engines.
  13. ##
  14. ## .. include:: ../../doc/pegdocs.txt
  15. ##
  16. include "system/inclrtl"
  17. const
  18. useUnicode = true ## change this to deactivate proper UTF-8 support
  19. import strutils, macros
  20. import std/private/decode_helpers
  21. when useUnicode:
  22. import unicode
  23. export unicode.`==`
  24. const
  25. InlineThreshold = 5 ## number of leaves; -1 to disable inlining
  26. MaxSubpatterns* = 20 ## defines the maximum number of subpatterns that
  27. ## can be captured. More subpatterns cannot be captured!
  28. type
  29. PegKind* = enum
  30. pkEmpty,
  31. pkAny, ## any character (.)
  32. pkAnyRune, ## any Unicode character (_)
  33. pkNewLine, ## CR-LF, LF, CR
  34. pkLetter, ## Unicode letter
  35. pkLower, ## Unicode lower case letter
  36. pkUpper, ## Unicode upper case letter
  37. pkTitle, ## Unicode title character
  38. pkWhitespace, ## Unicode whitespace character
  39. pkTerminal,
  40. pkTerminalIgnoreCase,
  41. pkTerminalIgnoreStyle,
  42. pkChar, ## single character to match
  43. pkCharChoice,
  44. pkNonTerminal,
  45. pkSequence, ## a b c ... --> Internal DSL: peg(a, b, c)
  46. pkOrderedChoice, ## a / b / ... --> Internal DSL: a / b or /[a, b, c]
  47. pkGreedyRep, ## a* --> Internal DSL: *a
  48. ## a+ --> (a a*)
  49. pkGreedyRepChar, ## x* where x is a single character (superop)
  50. pkGreedyRepSet, ## [set]* (superop)
  51. pkGreedyAny, ## .* or _* (superop)
  52. pkOption, ## a? --> Internal DSL: ?a
  53. pkAndPredicate, ## &a --> Internal DSL: &a
  54. pkNotPredicate, ## !a --> Internal DSL: !a
  55. pkCapture, ## {a} --> Internal DSL: capture(a)
  56. pkBackRef, ## $i --> Internal DSL: backref(i)
  57. pkBackRefIgnoreCase,
  58. pkBackRefIgnoreStyle,
  59. pkSearch, ## @a --> Internal DSL: !*a
  60. pkCapturedSearch, ## {@} a --> Internal DSL: !*\a
  61. pkRule, ## a <- b
  62. pkList, ## a, b
  63. pkStartAnchor ## ^ --> Internal DSL: startAnchor()
  64. NonTerminalFlag* = enum
  65. ntDeclared, ntUsed
  66. NonTerminalObj = object ## represents a non terminal symbol
  67. name: string ## the name of the symbol
  68. line: int ## line the symbol has been declared/used in
  69. col: int ## column the symbol has been declared/used in
  70. flags: set[NonTerminalFlag] ## the nonterminal's flags
  71. rule: Peg ## the rule that the symbol refers to
  72. Peg* {.shallow.} = object ## type that represents a PEG
  73. case kind: PegKind
  74. of pkEmpty..pkWhitespace: nil
  75. of pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle: term: string
  76. of pkChar, pkGreedyRepChar: ch: char
  77. of pkCharChoice, pkGreedyRepSet: charChoice: ref set[char]
  78. of pkNonTerminal: nt: NonTerminal
  79. of pkBackRef..pkBackRefIgnoreStyle: index: range[-MaxSubpatterns..MaxSubpatterns-1]
  80. else: sons: seq[Peg]
  81. NonTerminal* = ref NonTerminalObj
  82. func kind*(p: Peg): PegKind = p.kind
  83. ## Returns the *PegKind* of a given *Peg* object.
  84. func term*(p: Peg): string = p.term
  85. ## Returns the *string* representation of a given *Peg* variant object
  86. ## where present.
  87. func ch*(p: Peg): char = p.ch
  88. ## Returns the *char* representation of a given *Peg* variant object
  89. ## where present.
  90. func charChoice*(p: Peg): ref set[char] = p.charChoice
  91. ## Returns the *charChoice* field of a given *Peg* variant object
  92. ## where present.
  93. func nt*(p: Peg): NonTerminal = p.nt
  94. ## Returns the *NonTerminal* object of a given *Peg* variant object
  95. ## where present.
  96. func index*(p: Peg): range[-MaxSubpatterns..MaxSubpatterns-1] = p.index
  97. ## Returns the back-reference index of a captured sub-pattern in the
  98. ## *Captures* object for a given *Peg* variant object where present.
  99. iterator items*(p: Peg): Peg {.inline.} =
  100. ## Yields the child nodes of a *Peg* variant object where present.
  101. for s in p.sons:
  102. yield s
  103. iterator pairs*(p: Peg): (int, Peg) {.inline.} =
  104. ## Yields the indices and child nodes of a *Peg* variant object where present.
  105. for i in 0 ..< p.sons.len:
  106. yield (i, p.sons[i])
  107. func name*(nt: NonTerminal): string = nt.name
  108. ## Gets the name of the symbol represented by the parent *Peg* object variant
  109. ## of a given *NonTerminal*.
  110. func line*(nt: NonTerminal): int = nt.line
  111. ## Gets the line number of the definition of the parent *Peg* object variant
  112. ## of a given *NonTerminal*.
  113. func col*(nt: NonTerminal): int = nt.col
  114. ## Gets the column number of the definition of the parent *Peg* object variant
  115. ## of a given *NonTerminal*.
  116. func flags*(nt: NonTerminal): set[NonTerminalFlag] = nt.flags
  117. ## Gets the *NonTerminalFlag*-typed flags field of the parent *Peg* variant
  118. ## object of a given *NonTerminal*.
  119. func rule*(nt: NonTerminal): Peg = nt.rule
  120. ## Gets the *Peg* object representing the rule definition of the parent *Peg*
  121. ## object variant of a given *NonTerminal*.
  122. func term*(t: string): Peg {.rtl, extern: "npegs$1Str".} =
  123. ## constructs a PEG from a terminal string
  124. if t.len != 1:
  125. result = Peg(kind: pkTerminal, term: t)
  126. else:
  127. result = Peg(kind: pkChar, ch: t[0])
  128. func termIgnoreCase*(t: string): Peg {.
  129. rtl, extern: "npegs$1".} =
  130. ## constructs a PEG from a terminal string; ignore case for matching
  131. result = Peg(kind: pkTerminalIgnoreCase, term: t)
  132. func termIgnoreStyle*(t: string): Peg {.
  133. rtl, extern: "npegs$1".} =
  134. ## constructs a PEG from a terminal string; ignore style for matching
  135. result = Peg(kind: pkTerminalIgnoreStyle, term: t)
  136. func term*(t: char): Peg {.rtl, extern: "npegs$1Char".} =
  137. ## constructs a PEG from a terminal char
  138. assert t != '\0'
  139. result = Peg(kind: pkChar, ch: t)
  140. func charSet*(s: set[char]): Peg {.rtl, extern: "npegs$1".} =
  141. ## constructs a PEG from a character set `s`
  142. assert '\0' notin s
  143. result = Peg(kind: pkCharChoice)
  144. new(result.charChoice)
  145. result.charChoice[] = s
  146. func len(a: Peg): int {.inline.} = return a.sons.len
  147. func add(d: var Peg, s: Peg) {.inline.} = add(d.sons, s)
  148. func addChoice(dest: var Peg, elem: Peg) =
  149. var L = dest.len-1
  150. if L >= 0 and dest.sons[L].kind == pkCharChoice:
  151. # caution! Do not introduce false aliasing here!
  152. case elem.kind
  153. of pkCharChoice:
  154. dest.sons[L] = charSet(dest.sons[L].charChoice[] + elem.charChoice[])
  155. of pkChar:
  156. dest.sons[L] = charSet(dest.sons[L].charChoice[] + {elem.ch})
  157. else: add(dest, elem)
  158. else: add(dest, elem)
  159. template multipleOp(k: PegKind, localOpt: untyped) =
  160. result = Peg(kind: k, sons: @[])
  161. for x in items(a):
  162. if x.kind == k:
  163. for y in items(x.sons):
  164. localOpt(result, y)
  165. else:
  166. localOpt(result, x)
  167. if result.len == 1:
  168. result = result.sons[0]
  169. func `/`*(a: varargs[Peg]): Peg {.
  170. rtl, extern: "npegsOrderedChoice".} =
  171. ## constructs an ordered choice with the PEGs in `a`
  172. multipleOp(pkOrderedChoice, addChoice)
  173. func addSequence(dest: var Peg, elem: Peg) =
  174. var L = dest.len-1
  175. if L >= 0 and dest.sons[L].kind == pkTerminal:
  176. # caution! Do not introduce false aliasing here!
  177. case elem.kind
  178. of pkTerminal:
  179. dest.sons[L] = term(dest.sons[L].term & elem.term)
  180. of pkChar:
  181. dest.sons[L] = term(dest.sons[L].term & elem.ch)
  182. else: add(dest, elem)
  183. else: add(dest, elem)
  184. func sequence*(a: varargs[Peg]): Peg {.
  185. rtl, extern: "npegs$1".} =
  186. ## constructs a sequence with all the PEGs from `a`
  187. multipleOp(pkSequence, addSequence)
  188. func `?`*(a: Peg): Peg {.rtl, extern: "npegsOptional".} =
  189. ## constructs an optional for the PEG `a`
  190. if a.kind in {pkOption, pkGreedyRep, pkGreedyAny, pkGreedyRepChar,
  191. pkGreedyRepSet}:
  192. # a* ? --> a*
  193. # a? ? --> a?
  194. result = a
  195. else:
  196. result = Peg(kind: pkOption, sons: @[a])
  197. func `*`*(a: Peg): Peg {.rtl, extern: "npegsGreedyRep".} =
  198. ## constructs a "greedy repetition" for the PEG `a`
  199. case a.kind
  200. of pkGreedyRep, pkGreedyRepChar, pkGreedyRepSet, pkGreedyAny, pkOption:
  201. assert false
  202. # produces endless loop!
  203. of pkChar:
  204. result = Peg(kind: pkGreedyRepChar, ch: a.ch)
  205. of pkCharChoice:
  206. result = Peg(kind: pkGreedyRepSet, charChoice: a.charChoice)
  207. of pkAny, pkAnyRune:
  208. result = Peg(kind: pkGreedyAny)
  209. else:
  210. result = Peg(kind: pkGreedyRep, sons: @[a])
  211. func `!*`*(a: Peg): Peg {.rtl, extern: "npegsSearch".} =
  212. ## constructs a "search" for the PEG `a`
  213. result = Peg(kind: pkSearch, sons: @[a])
  214. func `!*\`*(a: Peg): Peg {.rtl,
  215. extern: "npgegsCapturedSearch".} =
  216. ## constructs a "captured search" for the PEG `a`
  217. result = Peg(kind: pkCapturedSearch, sons: @[a])
  218. func `+`*(a: Peg): Peg {.rtl, extern: "npegsGreedyPosRep".} =
  219. ## constructs a "greedy positive repetition" with the PEG `a`
  220. return sequence(a, *a)
  221. func `&`*(a: Peg): Peg {.rtl, extern: "npegsAndPredicate".} =
  222. ## constructs an "and predicate" with the PEG `a`
  223. result = Peg(kind: pkAndPredicate, sons: @[a])
  224. func `!`*(a: Peg): Peg {.rtl, extern: "npegsNotPredicate".} =
  225. ## constructs a "not predicate" with the PEG `a`
  226. result = Peg(kind: pkNotPredicate, sons: @[a])
  227. func any*: Peg {.inline.} =
  228. ## constructs the PEG `any character`:idx: (``.``)
  229. result = Peg(kind: pkAny)
  230. func anyRune*: Peg {.inline.} =
  231. ## constructs the PEG `any rune`:idx: (``_``)
  232. result = Peg(kind: pkAnyRune)
  233. func newLine*: Peg {.inline.} =
  234. ## constructs the PEG `newline`:idx: (``\n``)
  235. result = Peg(kind: pkNewLine)
  236. func unicodeLetter*: Peg {.inline.} =
  237. ## constructs the PEG ``\letter`` which matches any Unicode letter.
  238. result = Peg(kind: pkLetter)
  239. func unicodeLower*: Peg {.inline.} =
  240. ## constructs the PEG ``\lower`` which matches any Unicode lowercase letter.
  241. result = Peg(kind: pkLower)
  242. func unicodeUpper*: Peg {.inline.} =
  243. ## constructs the PEG ``\upper`` which matches any Unicode uppercase letter.
  244. result = Peg(kind: pkUpper)
  245. func unicodeTitle*: Peg {.inline.} =
  246. ## constructs the PEG ``\title`` which matches any Unicode title letter.
  247. result = Peg(kind: pkTitle)
  248. func unicodeWhitespace*: Peg {.inline.} =
  249. ## constructs the PEG ``\white`` which matches any Unicode
  250. ## whitespace character.
  251. result = Peg(kind: pkWhitespace)
  252. func startAnchor*: Peg {.inline.} =
  253. ## constructs the PEG ``^`` which matches the start of the input.
  254. result = Peg(kind: pkStartAnchor)
  255. func endAnchor*: Peg {.inline.} =
  256. ## constructs the PEG ``$`` which matches the end of the input.
  257. result = !any()
  258. func capture*(a: Peg = Peg(kind: pkEmpty)): Peg {.rtl, extern: "npegsCapture".} =
  259. ## constructs a capture with the PEG `a`
  260. result = Peg(kind: pkCapture, sons: @[a])
  261. func backref*(index: range[1..MaxSubpatterns], reverse: bool = false): Peg {.
  262. rtl, extern: "npegs$1".} =
  263. ## constructs a back reference of the given `index`. `index` starts counting
  264. ## from 1. `reverse` specifies wether indexing starts from the end of the
  265. ## capture list.
  266. result = Peg(kind: pkBackRef, index: (if reverse: -index else: index - 1))
  267. func backrefIgnoreCase*(index: range[1..MaxSubpatterns], reverse: bool = false): Peg {.
  268. rtl, extern: "npegs$1".} =
  269. ## constructs a back reference of the given `index`. `index` starts counting
  270. ## from 1. `reverse` specifies wether indexing starts from the end of the
  271. ## capture list. Ignores case for matching.
  272. result = Peg(kind: pkBackRefIgnoreCase, index: (if reverse: -index else: index - 1))
  273. func backrefIgnoreStyle*(index: range[1..MaxSubpatterns], reverse: bool = false): Peg {.
  274. rtl, extern: "npegs$1".} =
  275. ## constructs a back reference of the given `index`. `index` starts counting
  276. ## from 1. `reverse` specifies wether indexing starts from the end of the
  277. ## capture list. Ignores style for matching.
  278. result = Peg(kind: pkBackRefIgnoreStyle, index: (if reverse: -index else: index - 1))
  279. func spaceCost(n: Peg): int =
  280. case n.kind
  281. of pkEmpty: discard
  282. of pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle, pkChar,
  283. pkGreedyRepChar, pkCharChoice, pkGreedyRepSet,
  284. pkAny..pkWhitespace, pkGreedyAny, pkBackRef..pkBackRefIgnoreStyle:
  285. result = 1
  286. of pkNonTerminal:
  287. # we cannot inline a rule with a non-terminal
  288. result = InlineThreshold+1
  289. else:
  290. for i in 0..n.len-1:
  291. inc(result, spaceCost(n.sons[i]))
  292. if result >= InlineThreshold: break
  293. func nonterminal*(n: NonTerminal): Peg {.
  294. rtl, extern: "npegs$1".} =
  295. ## constructs a PEG that consists of the nonterminal symbol
  296. assert n != nil
  297. if ntDeclared in n.flags and spaceCost(n.rule) < InlineThreshold:
  298. when false: echo "inlining symbol: ", n.name
  299. result = n.rule # inlining of rule enables better optimizations
  300. else:
  301. result = Peg(kind: pkNonTerminal, nt: n)
  302. func newNonTerminal*(name: string, line, column: int): NonTerminal {.
  303. rtl, extern: "npegs$1".} =
  304. ## constructs a nonterminal symbol
  305. result = NonTerminal(name: name, line: line, col: column)
  306. template letters*: Peg =
  307. ## expands to ``charset({'A'..'Z', 'a'..'z'})``
  308. charSet({'A'..'Z', 'a'..'z'})
  309. template digits*: Peg =
  310. ## expands to ``charset({'0'..'9'})``
  311. charSet({'0'..'9'})
  312. template whitespace*: Peg =
  313. ## expands to ``charset({' ', '\9'..'\13'})``
  314. charSet({' ', '\9'..'\13'})
  315. template identChars*: Peg =
  316. ## expands to ``charset({'a'..'z', 'A'..'Z', '0'..'9', '_'})``
  317. charSet({'a'..'z', 'A'..'Z', '0'..'9', '_'})
  318. template identStartChars*: Peg =
  319. ## expands to ``charset({'A'..'Z', 'a'..'z', '_'})``
  320. charSet({'a'..'z', 'A'..'Z', '_'})
  321. template ident*: Peg =
  322. ## same as ``[a-zA-Z_][a-zA-z_0-9]*``; standard identifier
  323. sequence(charSet({'a'..'z', 'A'..'Z', '_'}),
  324. *charSet({'a'..'z', 'A'..'Z', '0'..'9', '_'}))
  325. template natural*: Peg =
  326. ## same as ``\d+``
  327. +digits
  328. # ------------------------- debugging -----------------------------------------
  329. func esc(c: char, reserved = {'\0'..'\255'}): string =
  330. case c
  331. of '\b': result = "\\b"
  332. of '\t': result = "\\t"
  333. of '\c': result = "\\c"
  334. of '\L': result = "\\l"
  335. of '\v': result = "\\v"
  336. of '\f': result = "\\f"
  337. of '\e': result = "\\e"
  338. of '\a': result = "\\a"
  339. of '\\': result = "\\\\"
  340. of 'a'..'z', 'A'..'Z', '0'..'9', '_': result = $c
  341. elif c < ' ' or c >= '\127': result = '\\' & $ord(c)
  342. elif c in reserved: result = '\\' & c
  343. else: result = $c
  344. func singleQuoteEsc(c: char): string = return "'" & esc(c, {'\''}) & "'"
  345. func singleQuoteEsc(str: string): string =
  346. result = "'"
  347. for c in items(str): add result, esc(c, {'\''})
  348. add result, '\''
  349. func charSetEscAux(cc: set[char]): string =
  350. const reserved = {'^', '-', ']'}
  351. result = ""
  352. var c1 = 0
  353. while c1 <= 0xff:
  354. if chr(c1) in cc:
  355. var c2 = c1
  356. while c2 < 0xff and chr(succ(c2)) in cc: inc(c2)
  357. if c1 == c2:
  358. add result, esc(chr(c1), reserved)
  359. elif c2 == succ(c1):
  360. add result, esc(chr(c1), reserved) & esc(chr(c2), reserved)
  361. else:
  362. add result, esc(chr(c1), reserved) & '-' & esc(chr(c2), reserved)
  363. c1 = c2
  364. inc(c1)
  365. func charSetEsc(cc: set[char]): string =
  366. if card(cc) >= 128+64:
  367. result = "[^" & charSetEscAux({'\1'..'\xFF'} - cc) & ']'
  368. else:
  369. result = '[' & charSetEscAux(cc) & ']'
  370. func toStrAux(r: Peg, res: var string) =
  371. case r.kind
  372. of pkEmpty: add(res, "()")
  373. of pkAny: add(res, '.')
  374. of pkAnyRune: add(res, '_')
  375. of pkLetter: add(res, "\\letter")
  376. of pkLower: add(res, "\\lower")
  377. of pkUpper: add(res, "\\upper")
  378. of pkTitle: add(res, "\\title")
  379. of pkWhitespace: add(res, "\\white")
  380. of pkNewLine: add(res, "\\n")
  381. of pkTerminal: add(res, singleQuoteEsc(r.term))
  382. of pkTerminalIgnoreCase:
  383. add(res, 'i')
  384. add(res, singleQuoteEsc(r.term))
  385. of pkTerminalIgnoreStyle:
  386. add(res, 'y')
  387. add(res, singleQuoteEsc(r.term))
  388. of pkChar: add(res, singleQuoteEsc(r.ch))
  389. of pkCharChoice: add(res, charSetEsc(r.charChoice[]))
  390. of pkNonTerminal: add(res, r.nt.name)
  391. of pkSequence:
  392. add(res, '(')
  393. toStrAux(r.sons[0], res)
  394. for i in 1 .. high(r.sons):
  395. add(res, ' ')
  396. toStrAux(r.sons[i], res)
  397. add(res, ')')
  398. of pkOrderedChoice:
  399. add(res, '(')
  400. toStrAux(r.sons[0], res)
  401. for i in 1 .. high(r.sons):
  402. add(res, " / ")
  403. toStrAux(r.sons[i], res)
  404. add(res, ')')
  405. of pkGreedyRep:
  406. toStrAux(r.sons[0], res)
  407. add(res, '*')
  408. of pkGreedyRepChar:
  409. add(res, singleQuoteEsc(r.ch))
  410. add(res, '*')
  411. of pkGreedyRepSet:
  412. add(res, charSetEsc(r.charChoice[]))
  413. add(res, '*')
  414. of pkGreedyAny:
  415. add(res, ".*")
  416. of pkOption:
  417. toStrAux(r.sons[0], res)
  418. add(res, '?')
  419. of pkAndPredicate:
  420. add(res, '&')
  421. toStrAux(r.sons[0], res)
  422. of pkNotPredicate:
  423. add(res, '!')
  424. toStrAux(r.sons[0], res)
  425. of pkSearch:
  426. add(res, '@')
  427. toStrAux(r.sons[0], res)
  428. of pkCapturedSearch:
  429. add(res, "{@}")
  430. toStrAux(r.sons[0], res)
  431. of pkCapture:
  432. add(res, '{')
  433. toStrAux(r.sons[0], res)
  434. add(res, '}')
  435. of pkBackRef:
  436. add(res, '$')
  437. add(res, $r.index)
  438. of pkBackRefIgnoreCase:
  439. add(res, "i$")
  440. add(res, $r.index)
  441. of pkBackRefIgnoreStyle:
  442. add(res, "y$")
  443. add(res, $r.index)
  444. of pkRule:
  445. toStrAux(r.sons[0], res)
  446. add(res, " <- ")
  447. toStrAux(r.sons[1], res)
  448. of pkList:
  449. for i in 0 .. high(r.sons):
  450. toStrAux(r.sons[i], res)
  451. add(res, "\n")
  452. of pkStartAnchor:
  453. add(res, '^')
  454. func `$` *(r: Peg): string {.rtl, extern: "npegsToString".} =
  455. ## converts a PEG to its string representation
  456. result = ""
  457. toStrAux(r, result)
  458. # --------------------- core engine -------------------------------------------
  459. type
  460. Captures* = object ## contains the captured substrings.
  461. matches: array[0..MaxSubpatterns-1, tuple[first, last: int]]
  462. ml: int
  463. origStart: int
  464. func bounds*(c: Captures,
  465. i: range[0..MaxSubpatterns-1]): tuple[first, last: int] =
  466. ## returns the bounds ``[first..last]`` of the `i`'th capture.
  467. result = c.matches[i]
  468. when not useUnicode:
  469. type
  470. Rune = char
  471. template fastRuneAt(s, i, ch) =
  472. ch = s[i]
  473. inc(i)
  474. template runeLenAt(s, i): untyped = 1
  475. func isAlpha(a: char): bool {.inline.} = return a in {'a'..'z', 'A'..'Z'}
  476. func isUpper(a: char): bool {.inline.} = return a in {'A'..'Z'}
  477. func isLower(a: char): bool {.inline.} = return a in {'a'..'z'}
  478. func isTitle(a: char): bool {.inline.} = return false
  479. func isWhiteSpace(a: char): bool {.inline.} = return a in {' ', '\9'..'\13'}
  480. template matchOrParse(mopProc: untyped) =
  481. # Used to make the main matcher proc *rawMatch* as well as event parser
  482. # procs. For the former, *enter* and *leave* event handler code generators
  483. # are provided which just return *discard*.
  484. proc mopProc(s: string, p: Peg, start: int, c: var Captures): int {.gcsafe, raises: [].} =
  485. proc matchBackRef(s: string, p: Peg, start: int, c: var Captures): int =
  486. # Parse handler code must run in an *of* clause of its own for each
  487. # *PegKind*, so we encapsulate the identical clause body for
  488. # *pkBackRef..pkBackRefIgnoreStyle* here.
  489. var index = p.index
  490. if index < 0: index.inc(c.ml)
  491. if index < 0 or index >= c.ml: return -1
  492. var (a, b) = c.matches[index]
  493. var n: Peg
  494. case p.kind
  495. of pkBackRef:
  496. n = Peg(kind: pkTerminal, term: s.substr(a, b))
  497. of pkBackRefIgnoreStyle:
  498. n = Peg(kind: pkTerminalIgnoreStyle, term: s.substr(a, b))
  499. of pkBackRefIgnoreCase:
  500. n = Peg(kind: pkTerminalIgnoreCase, term: s.substr(a, b))
  501. else: assert(false, "impossible case")
  502. mopProc(s, n, start, c)
  503. case p.kind
  504. of pkEmpty:
  505. enter(pkEmpty, s, p, start)
  506. result = 0 # match of length 0
  507. leave(pkEmpty, s, p, start, result)
  508. of pkAny:
  509. enter(pkAny, s, p, start)
  510. if start < s.len: result = 1
  511. else: result = -1
  512. leave(pkAny, s, p, start, result)
  513. of pkAnyRune:
  514. enter(pkAnyRune, s, p, start)
  515. if start < s.len:
  516. result = runeLenAt(s, start)
  517. else:
  518. result = -1
  519. leave(pkAnyRune, s, p, start, result)
  520. of pkLetter:
  521. enter(pkLetter, s, p, start)
  522. if start < s.len:
  523. var a: Rune
  524. result = start
  525. fastRuneAt(s, result, a)
  526. if isAlpha(a): dec(result, start)
  527. else: result = -1
  528. else:
  529. result = -1
  530. leave(pkLetter, s, p, start, result)
  531. of pkLower:
  532. enter(pkLower, s, p, start)
  533. if start < s.len:
  534. var a: Rune
  535. result = start
  536. fastRuneAt(s, result, a)
  537. if isLower(a): dec(result, start)
  538. else: result = -1
  539. else:
  540. result = -1
  541. leave(pkLower, s, p, start, result)
  542. of pkUpper:
  543. enter(pkUpper, s, p, start)
  544. if start < s.len:
  545. var a: Rune
  546. result = start
  547. fastRuneAt(s, result, a)
  548. if isUpper(a): dec(result, start)
  549. else: result = -1
  550. else:
  551. result = -1
  552. leave(pkUpper, s, p, start, result)
  553. of pkTitle:
  554. enter(pkTitle, s, p, start)
  555. if start < s.len:
  556. var a: Rune
  557. result = start
  558. fastRuneAt(s, result, a)
  559. if isTitle(a): dec(result, start)
  560. else: result = -1
  561. else:
  562. result = -1
  563. leave(pkTitle, s, p, start, result)
  564. of pkWhitespace:
  565. enter(pkWhitespace, s, p, start)
  566. if start < s.len:
  567. var a: Rune
  568. result = start
  569. fastRuneAt(s, result, a)
  570. if isWhiteSpace(a): dec(result, start)
  571. else: result = -1
  572. else:
  573. result = -1
  574. leave(pkWhitespace, s, p, start, result)
  575. of pkGreedyAny:
  576. enter(pkGreedyAny, s, p, start)
  577. result = len(s) - start
  578. leave(pkGreedyAny, s, p, start, result)
  579. of pkNewLine:
  580. enter(pkNewLine, s, p, start)
  581. if start < s.len and s[start] == '\L': result = 1
  582. elif start < s.len and s[start] == '\C':
  583. if start+1 < s.len and s[start+1] == '\L': result = 2
  584. else: result = 1
  585. else: result = -1
  586. leave(pkNewLine, s, p, start, result)
  587. of pkTerminal:
  588. enter(pkTerminal, s, p, start)
  589. result = len(p.term)
  590. for i in 0..result-1:
  591. if start+i >= s.len or p.term[i] != s[start+i]:
  592. result = -1
  593. break
  594. leave(pkTerminal, s, p, start, result)
  595. of pkTerminalIgnoreCase:
  596. enter(pkTerminalIgnoreCase, s, p, start)
  597. var
  598. i = 0
  599. a, b: Rune
  600. result = start
  601. while i < len(p.term):
  602. if result >= s.len:
  603. result = -1
  604. break
  605. fastRuneAt(p.term, i, a)
  606. fastRuneAt(s, result, b)
  607. if toLower(a) != toLower(b):
  608. result = -1
  609. break
  610. dec(result, start)
  611. leave(pkTerminalIgnoreCase, s, p, start, result)
  612. of pkTerminalIgnoreStyle:
  613. enter(pkTerminalIgnoreStyle, s, p, start)
  614. var
  615. i = 0
  616. a, b: Rune
  617. result = start
  618. while i < len(p.term):
  619. while i < len(p.term):
  620. fastRuneAt(p.term, i, a)
  621. if a != Rune('_'): break
  622. while result < s.len:
  623. fastRuneAt(s, result, b)
  624. if b != Rune('_'): break
  625. if result >= s.len:
  626. if i >= p.term.len: break
  627. else:
  628. result = -1
  629. break
  630. elif toLower(a) != toLower(b):
  631. result = -1
  632. break
  633. dec(result, start)
  634. leave(pkTerminalIgnoreStyle, s, p, start, result)
  635. of pkChar:
  636. enter(pkChar, s, p, start)
  637. if start < s.len and p.ch == s[start]: result = 1
  638. else: result = -1
  639. leave(pkChar, s, p, start, result)
  640. of pkCharChoice:
  641. enter(pkCharChoice, s, p, start)
  642. if start < s.len and contains(p.charChoice[], s[start]): result = 1
  643. else: result = -1
  644. leave(pkCharChoice, s, p, start, result)
  645. of pkNonTerminal:
  646. enter(pkNonTerminal, s, p, start)
  647. var oldMl = c.ml
  648. when false: echo "enter: ", p.nt.name
  649. result = mopProc(s, p.nt.rule, start, c)
  650. when false: echo "leave: ", p.nt.name
  651. if result < 0: c.ml = oldMl
  652. leave(pkNonTerminal, s, p, start, result)
  653. of pkSequence:
  654. enter(pkSequence, s, p, start)
  655. var oldMl = c.ml
  656. result = 0
  657. for i in 0..high(p.sons):
  658. var x = mopProc(s, p.sons[i], start+result, c)
  659. if x < 0:
  660. c.ml = oldMl
  661. result = -1
  662. break
  663. else: inc(result, x)
  664. leave(pkSequence, s, p, start, result)
  665. of pkOrderedChoice:
  666. enter(pkOrderedChoice, s, p, start)
  667. var oldMl = c.ml
  668. for i in 0..high(p.sons):
  669. result = mopProc(s, p.sons[i], start, c)
  670. if result >= 0: break
  671. c.ml = oldMl
  672. leave(pkOrderedChoice, s, p, start, result)
  673. of pkSearch:
  674. enter(pkSearch, s, p, start)
  675. var oldMl = c.ml
  676. result = 0
  677. while start+result <= s.len:
  678. var x = mopProc(s, p.sons[0], start+result, c)
  679. if x >= 0:
  680. inc(result, x)
  681. leave(pkSearch, s, p, start, result)
  682. return
  683. inc(result)
  684. result = -1
  685. c.ml = oldMl
  686. leave(pkSearch, s, p, start, result)
  687. of pkCapturedSearch:
  688. enter(pkCapturedSearch, s, p, start)
  689. var idx = c.ml # reserve a slot for the subpattern
  690. inc(c.ml)
  691. result = 0
  692. while start+result <= s.len:
  693. var x = mopProc(s, p.sons[0], start+result, c)
  694. if x >= 0:
  695. if idx < MaxSubpatterns:
  696. c.matches[idx] = (start, start+result-1)
  697. #else: silently ignore the capture
  698. inc(result, x)
  699. leave(pkCapturedSearch, s, p, start, result)
  700. return
  701. inc(result)
  702. result = -1
  703. c.ml = idx
  704. leave(pkCapturedSearch, s, p, start, result)
  705. of pkGreedyRep:
  706. enter(pkGreedyRep, s, p, start)
  707. result = 0
  708. while true:
  709. var x = mopProc(s, p.sons[0], start+result, c)
  710. # if x == 0, we have an endless loop; so the correct behaviour would be
  711. # not to break. But endless loops can be easily introduced:
  712. # ``(comment / \w*)*`` is such an example. Breaking for x == 0 does the
  713. # expected thing in this case.
  714. if x <= 0: break
  715. inc(result, x)
  716. leave(pkGreedyRep, s, p, start, result)
  717. of pkGreedyRepChar:
  718. enter(pkGreedyRepChar, s, p, start)
  719. result = 0
  720. var ch = p.ch
  721. while start+result < s.len and ch == s[start+result]: inc(result)
  722. leave(pkGreedyRepChar, s, p, start, result)
  723. of pkGreedyRepSet:
  724. enter(pkGreedyRepSet, s, p, start)
  725. result = 0
  726. while start+result < s.len and contains(p.charChoice[], s[start+result]):
  727. inc(result)
  728. leave(pkGreedyRepSet, s, p, start, result)
  729. of pkOption:
  730. enter(pkOption, s, p, start)
  731. result = max(0, mopProc(s, p.sons[0], start, c))
  732. leave(pkOption, s, p, start, result)
  733. of pkAndPredicate:
  734. enter(pkAndPredicate, s, p, start)
  735. var oldMl = c.ml
  736. result = mopProc(s, p.sons[0], start, c)
  737. if result >= 0: result = 0 # do not consume anything
  738. else: c.ml = oldMl
  739. leave(pkAndPredicate, s, p, start, result)
  740. of pkNotPredicate:
  741. enter(pkNotPredicate, s, p, start)
  742. var oldMl = c.ml
  743. result = mopProc(s, p.sons[0], start, c)
  744. if result < 0: result = 0
  745. else:
  746. c.ml = oldMl
  747. result = -1
  748. leave(pkNotPredicate, s, p, start, result)
  749. of pkCapture:
  750. enter(pkCapture, s, p, start)
  751. if p.sons.len == 0 or p.sons[0].kind == pkEmpty:
  752. # empty capture removes last match
  753. dec(c.ml)
  754. c.matches[c.ml] = (0, 0)
  755. result = 0 # match of length 0
  756. else:
  757. var idx = c.ml # reserve a slot for the subpattern
  758. result = mopProc(s, p.sons[0], start, c)
  759. if result >= 0:
  760. if idx < MaxSubpatterns:
  761. if idx != c.ml:
  762. for i in countdown(c.ml, idx):
  763. c.matches[i+1] = c.matches[i]
  764. c.matches[idx] = (start, start+result-1)
  765. #else: silently ignore the capture
  766. inc(c.ml)
  767. leave(pkCapture, s, p, start, result)
  768. of pkBackRef:
  769. enter(pkBackRef, s, p, start)
  770. result = matchBackRef(s, p, start, c)
  771. leave(pkBackRef, s, p, start, result)
  772. of pkBackRefIgnoreCase:
  773. enter(pkBackRefIgnoreCase, s, p, start)
  774. result = matchBackRef(s, p, start, c)
  775. leave(pkBackRefIgnoreCase, s, p, start, result)
  776. of pkBackRefIgnoreStyle:
  777. enter(pkBackRefIgnoreStyle, s, p, start)
  778. result = matchBackRef(s, p, start, c)
  779. leave(pkBackRefIgnoreStyle, s, p, start, result)
  780. of pkStartAnchor:
  781. enter(pkStartAnchor, s, p, start)
  782. if c.origStart == start: result = 0
  783. else: result = -1
  784. leave(pkStartAnchor, s, p, start, result)
  785. of pkRule, pkList: assert false
  786. func rawMatch*(s: string, p: Peg, start: int, c: var Captures): int
  787. {.rtl, extern: "npegs$1".} =
  788. ## low-level matching proc that implements the PEG interpreter. Use this
  789. ## for maximum efficiency (every other PEG operation ends up calling this
  790. ## proc).
  791. ## Returns -1 if it does not match, else the length of the match
  792. # Set the handler generators to produce do-nothing handlers.
  793. template enter(pk, s, p, start) =
  794. discard
  795. template leave(pk, s, p, start, length) =
  796. discard
  797. matchOrParse(matchIt)
  798. {.cast(noSideEffect).}:
  799. # This cast is allowed because the `matchOrParse` template is used for
  800. # both matching and parsing, but side effects are only possible when it's
  801. # used by `eventParser`.
  802. result = matchIt(s, p, start, c)
  803. macro mkHandlerTplts(handlers: untyped): untyped =
  804. # Transforms the handler spec in *handlers* into handler templates.
  805. # The AST structure of *handlers[0]*:
  806. #
  807. # .. code-block::
  808. # StmtList
  809. # Call
  810. # Ident "pkNonTerminal"
  811. # StmtList
  812. # Call
  813. # Ident "enter"
  814. # StmtList
  815. # <handler code block>
  816. # Call
  817. # Ident "leave"
  818. # StmtList
  819. # <handler code block>
  820. # Call
  821. # Ident "pkChar"
  822. # StmtList
  823. # Call
  824. # Ident "leave"
  825. # StmtList
  826. # <handler code block>
  827. # ...
  828. func mkEnter(hdName, body: NimNode): NimNode =
  829. template helper(hdName, body) {.dirty.} =
  830. template hdName(s, p, start) =
  831. let s {.inject.} = s
  832. let p {.inject.} = p
  833. let start {.inject.} = start
  834. body
  835. result = getAst(helper(hdName, body))
  836. template mkLeave(hdPostf, body) {.dirty.} =
  837. # this has to be dirty to be able to capture *result* as *length* in
  838. # *leaveXX* calls.
  839. template `leave hdPostf`(s, p, start, length) =
  840. body
  841. result = newStmtList()
  842. for topCall in handlers[0]:
  843. if topCall.kind notin nnkCallKinds:
  844. error("Call syntax expected.", topCall)
  845. let pegKind = topCall[0]
  846. if pegKind.kind notin {nnkIdent, nnkSym}:
  847. error("PegKind expected.", pegKind)
  848. if 2 == topCall.len:
  849. for hdDef in topCall[1]:
  850. if hdDef.kind notin nnkCallKinds:
  851. error("Call syntax expected.", hdDef)
  852. if hdDef[0].kind notin {nnkIdent, nnkSym}:
  853. error("Handler identifier expected.", hdDef[0])
  854. if 2 == hdDef.len:
  855. let hdPostf = substr(pegKind.strVal, 2)
  856. case hdDef[0].strVal
  857. of "enter":
  858. result.add mkEnter(newIdentNode("enter" & hdPostf), hdDef[1])
  859. of "leave":
  860. result.add getAst(mkLeave(ident(hdPostf), hdDef[1]))
  861. else:
  862. error(
  863. "Unsupported handler identifier, expected 'enter' or 'leave'.",
  864. hdDef[0]
  865. )
  866. template eventParser*(pegAst, handlers: untyped): (proc(s: string): int) =
  867. ## Generates an interpreting event parser *proc* according to the specified
  868. ## PEG AST and handler code blocks. The *proc* can be called with a string
  869. ## to be parsed and will execute the handler code blocks whenever their
  870. ## associated grammar element is matched. It returns -1 if the string does not
  871. ## match, else the length of the total match. The following example code
  872. ## evaluates an arithmetic expression defined by a simple PEG:
  873. ##
  874. ## .. code-block:: nim
  875. ## import std/[strutils, pegs]
  876. ##
  877. ## let
  878. ## pegAst = """
  879. ## Expr <- Sum
  880. ## Sum <- Product (('+' / '-')Product)*
  881. ## Product <- Value (('*' / '/')Value)*
  882. ## Value <- [0-9]+ / '(' Expr ')'
  883. ## """.peg
  884. ## txt = "(5+3)/2-7*22"
  885. ##
  886. ## var
  887. ## pStack: seq[string] = @[]
  888. ## valStack: seq[float] = @[]
  889. ## opStack = ""
  890. ## let
  891. ## parseArithExpr = pegAst.eventParser:
  892. ## pkNonTerminal:
  893. ## enter:
  894. ## pStack.add p.nt.name
  895. ## leave:
  896. ## pStack.setLen pStack.high
  897. ## if length > 0:
  898. ## let matchStr = s.substr(start, start+length-1)
  899. ## case p.nt.name
  900. ## of "Value":
  901. ## try:
  902. ## valStack.add matchStr.parseFloat
  903. ## echo valStack
  904. ## except ValueError:
  905. ## discard
  906. ## of "Sum", "Product":
  907. ## try:
  908. ## let val = matchStr.parseFloat
  909. ## except ValueError:
  910. ## if valStack.len > 1 and opStack.len > 0:
  911. ## valStack[^2] = case opStack[^1]
  912. ## of '+': valStack[^2] + valStack[^1]
  913. ## of '-': valStack[^2] - valStack[^1]
  914. ## of '*': valStack[^2] * valStack[^1]
  915. ## else: valStack[^2] / valStack[^1]
  916. ## valStack.setLen valStack.high
  917. ## echo valStack
  918. ## opStack.setLen opStack.high
  919. ## echo opStack
  920. ## pkChar:
  921. ## leave:
  922. ## if length == 1 and "Value" != pStack[^1]:
  923. ## let matchChar = s[start]
  924. ## opStack.add matchChar
  925. ## echo opStack
  926. ##
  927. ## let pLen = parseArithExpr(txt)
  928. ##
  929. ## The *handlers* parameter consists of code blocks for *PegKinds*,
  930. ## which define the grammar elements of interest. Each block can contain
  931. ## handler code to be executed when the parser enters and leaves text
  932. ## matching the grammar element. An *enter* handler can access the specific
  933. ## PEG AST node being matched as *p*, the entire parsed string as *s*
  934. ## and the position of the matched text segment in *s* as *start*. A *leave*
  935. ## handler can access *p*, *s*, *start* and also the length of the matched
  936. ## text segment as *length*. For an unsuccessful match, the *enter* and
  937. ## *leave* handlers will be executed, with *length* set to -1.
  938. ##
  939. ## Symbols declared in an *enter* handler can be made visible in the
  940. ## corresponding *leave* handler by annotating them with an *inject* pragma.
  941. proc rawParse(s: string, p: Peg, start: int, c: var Captures): int
  942. {.gensym.} =
  943. # binding from *macros*
  944. bind strVal
  945. mkHandlerTplts:
  946. handlers
  947. macro enter(pegKind, s, pegNode, start: untyped): untyped =
  948. # This is called by the matcher code in *matchOrParse* at the
  949. # start of the code for a grammar element of kind *pegKind*.
  950. # Expands to a call to the handler template if one was generated
  951. # by *mkHandlerTplts*.
  952. template mkDoEnter(hdPostf, s, pegNode, start) =
  953. when declared(`enter hdPostf`):
  954. `enter hdPostf`(s, pegNode, start):
  955. else:
  956. discard
  957. let hdPostf = ident(substr(strVal(pegKind), 2))
  958. getAst(mkDoEnter(hdPostf, s, pegNode, start))
  959. macro leave(pegKind, s, pegNode, start, length: untyped): untyped =
  960. # Like *enter*, but called at the end of the matcher code for
  961. # a grammar element of kind *pegKind*.
  962. template mkDoLeave(hdPostf, s, pegNode, start, length) =
  963. when declared(`leave hdPostf`):
  964. `leave hdPostf`(s, pegNode, start, length):
  965. else:
  966. discard
  967. let hdPostf = ident(substr(strVal(pegKind), 2))
  968. getAst(mkDoLeave(hdPostf, s, pegNode, start, length))
  969. matchOrParse(parseIt)
  970. parseIt(s, p, start, c)
  971. proc parser(s: string): int {.gensym.} =
  972. # the proc to be returned
  973. var
  974. ms: array[MaxSubpatterns, (int, int)]
  975. cs = Captures(matches: ms, ml: 0, origStart: 0)
  976. rawParse(s, pegAst, 0, cs)
  977. parser
  978. template fillMatches(s, caps, c) =
  979. for k in 0..c.ml-1:
  980. let startIdx = c.matches[k][0]
  981. let endIdx = c.matches[k][1]
  982. if startIdx != -1:
  983. caps[k] = substr(s, startIdx, endIdx)
  984. else:
  985. caps[k] = ""
  986. func matchLen*(s: string, pattern: Peg, matches: var openArray[string],
  987. start = 0): int {.rtl, extern: "npegs$1Capture".} =
  988. ## the same as ``match``, but it returns the length of the match,
  989. ## if there is no match, -1 is returned. Note that a match length
  990. ## of zero can happen. It's possible that a suffix of `s` remains
  991. ## that does not belong to the match.
  992. var c: Captures
  993. c.origStart = start
  994. result = rawMatch(s, pattern, start, c)
  995. if result >= 0: fillMatches(s, matches, c)
  996. func matchLen*(s: string, pattern: Peg,
  997. start = 0): int {.rtl, extern: "npegs$1".} =
  998. ## the same as ``match``, but it returns the length of the match,
  999. ## if there is no match, -1 is returned. Note that a match length
  1000. ## of zero can happen. It's possible that a suffix of `s` remains
  1001. ## that does not belong to the match.
  1002. var c: Captures
  1003. c.origStart = start
  1004. result = rawMatch(s, pattern, start, c)
  1005. func match*(s: string, pattern: Peg, matches: var openArray[string],
  1006. start = 0): bool {.rtl, extern: "npegs$1Capture".} =
  1007. ## returns ``true`` if ``s[start..]`` matches the ``pattern`` and
  1008. ## the captured substrings in the array ``matches``. If it does not
  1009. ## match, nothing is written into ``matches`` and ``false`` is
  1010. ## returned.
  1011. result = matchLen(s, pattern, matches, start) != -1
  1012. func match*(s: string, pattern: Peg,
  1013. start = 0): bool {.rtl, extern: "npegs$1".} =
  1014. ## returns ``true`` if ``s`` matches the ``pattern`` beginning from ``start``.
  1015. result = matchLen(s, pattern, start) != -1
  1016. func find*(s: string, pattern: Peg, matches: var openArray[string],
  1017. start = 0): int {.rtl, extern: "npegs$1Capture".} =
  1018. ## returns the starting position of ``pattern`` in ``s`` and the captured
  1019. ## substrings in the array ``matches``. If it does not match, nothing
  1020. ## is written into ``matches`` and -1 is returned.
  1021. var c: Captures
  1022. c.origStart = start
  1023. for i in start .. s.len-1:
  1024. c.ml = 0
  1025. if rawMatch(s, pattern, i, c) >= 0:
  1026. fillMatches(s, matches, c)
  1027. return i
  1028. return -1
  1029. # could also use the pattern here: (!P .)* P
  1030. func findBounds*(s: string, pattern: Peg, matches: var openArray[string],
  1031. start = 0): tuple[first, last: int] {.
  1032. rtl, extern: "npegs$1Capture".} =
  1033. ## returns the starting position and end position of ``pattern`` in ``s``
  1034. ## and the captured
  1035. ## substrings in the array ``matches``. If it does not match, nothing
  1036. ## is written into ``matches`` and (-1,0) is returned.
  1037. var c: Captures
  1038. c.origStart = start
  1039. for i in start .. s.len-1:
  1040. c.ml = 0
  1041. var L = rawMatch(s, pattern, i, c)
  1042. if L >= 0:
  1043. fillMatches(s, matches, c)
  1044. return (i, i+L-1)
  1045. return (-1, 0)
  1046. func find*(s: string, pattern: Peg,
  1047. start = 0): int {.rtl, extern: "npegs$1".} =
  1048. ## returns the starting position of ``pattern`` in ``s``. If it does not
  1049. ## match, -1 is returned.
  1050. var c: Captures
  1051. c.origStart = start
  1052. for i in start .. s.len-1:
  1053. if rawMatch(s, pattern, i, c) >= 0: return i
  1054. return -1
  1055. iterator findAll*(s: string, pattern: Peg, start = 0): string =
  1056. ## yields all matching *substrings* of `s` that match `pattern`.
  1057. var c: Captures
  1058. c.origStart = start
  1059. var i = start
  1060. while i < s.len:
  1061. c.ml = 0
  1062. var L = rawMatch(s, pattern, i, c)
  1063. if L < 0:
  1064. inc(i, 1)
  1065. else:
  1066. yield substr(s, i, i+L-1)
  1067. inc(i, L)
  1068. func findAll*(s: string, pattern: Peg, start = 0): seq[string] {.
  1069. rtl, extern: "npegs$1".} =
  1070. ## returns all matching *substrings* of `s` that match `pattern`.
  1071. ## If it does not match, @[] is returned.
  1072. result = @[]
  1073. for it in findAll(s, pattern, start): result.add it
  1074. template `=~`*(s: string, pattern: Peg): bool =
  1075. ## This calls ``match`` with an implicit declared ``matches`` array that
  1076. ## can be used in the scope of the ``=~`` call:
  1077. ##
  1078. ## .. code-block:: nim
  1079. ##
  1080. ## if line =~ peg"\s* {\w+} \s* '=' \s* {\w+}":
  1081. ## # matches a key=value pair:
  1082. ## echo("Key: ", matches[0])
  1083. ## echo("Value: ", matches[1])
  1084. ## elif line =~ peg"\s*{'#'.*}":
  1085. ## # matches a comment
  1086. ## # note that the implicit ``matches`` array is different from the
  1087. ## # ``matches`` array of the first branch
  1088. ## echo("comment: ", matches[0])
  1089. ## else:
  1090. ## echo("syntax error")
  1091. ##
  1092. bind MaxSubpatterns
  1093. when not declaredInScope(matches):
  1094. var matches {.inject.}: array[0..MaxSubpatterns-1, string]
  1095. match(s, pattern, matches)
  1096. # ------------------------- more string handling ------------------------------
  1097. func contains*(s: string, pattern: Peg, start = 0): bool {.
  1098. rtl, extern: "npegs$1".} =
  1099. ## same as ``find(s, pattern, start) >= 0``
  1100. return find(s, pattern, start) >= 0
  1101. func contains*(s: string, pattern: Peg, matches: var openArray[string],
  1102. start = 0): bool {.rtl, extern: "npegs$1Capture".} =
  1103. ## same as ``find(s, pattern, matches, start) >= 0``
  1104. return find(s, pattern, matches, start) >= 0
  1105. func startsWith*(s: string, prefix: Peg, start = 0): bool {.
  1106. rtl, extern: "npegs$1".} =
  1107. ## returns true if `s` starts with the pattern `prefix`
  1108. result = matchLen(s, prefix, start) >= 0
  1109. func endsWith*(s: string, suffix: Peg, start = 0): bool {.
  1110. rtl, extern: "npegs$1".} =
  1111. ## returns true if `s` ends with the pattern `suffix`
  1112. var c: Captures
  1113. c.origStart = start
  1114. for i in start .. s.len-1:
  1115. if rawMatch(s, suffix, i, c) == s.len - i: return true
  1116. func replacef*(s: string, sub: Peg, by: string): string {.
  1117. rtl, extern: "npegs$1".} =
  1118. ## Replaces `sub` in `s` by the string `by`. Captures can be accessed in `by`
  1119. ## with the notation ``$i`` and ``$#`` (see strutils.`%`). Examples:
  1120. ##
  1121. ## .. code-block:: nim
  1122. ## "var1=key; var2=key2".replacef(peg"{\ident}'='{\ident}", "$1<-$2$2")
  1123. ##
  1124. ## Results in:
  1125. ##
  1126. ## .. code-block:: nim
  1127. ##
  1128. ## "var1<-keykey; val2<-key2key2"
  1129. result = ""
  1130. var i = 0
  1131. var caps: array[0..MaxSubpatterns-1, string]
  1132. var c: Captures
  1133. while i < s.len:
  1134. c.ml = 0
  1135. var x = rawMatch(s, sub, i, c)
  1136. if x <= 0:
  1137. add(result, s[i])
  1138. inc(i)
  1139. else:
  1140. fillMatches(s, caps, c)
  1141. addf(result, by, caps)
  1142. inc(i, x)
  1143. add(result, substr(s, i))
  1144. func replace*(s: string, sub: Peg, by = ""): string {.
  1145. rtl, extern: "npegs$1".} =
  1146. ## Replaces `sub` in `s` by the string `by`. Captures cannot be accessed
  1147. ## in `by`.
  1148. result = ""
  1149. var i = 0
  1150. var c: Captures
  1151. while i < s.len:
  1152. var x = rawMatch(s, sub, i, c)
  1153. if x <= 0:
  1154. add(result, s[i])
  1155. inc(i)
  1156. else:
  1157. add(result, by)
  1158. inc(i, x)
  1159. add(result, substr(s, i))
  1160. func parallelReplace*(s: string, subs: varargs[
  1161. tuple[pattern: Peg, repl: string]]): string {.
  1162. rtl, extern: "npegs$1".} =
  1163. ## Returns a modified copy of `s` with the substitutions in `subs`
  1164. ## applied in parallel.
  1165. result = ""
  1166. var i = 0
  1167. var c: Captures
  1168. var caps: array[0..MaxSubpatterns-1, string]
  1169. while i < s.len:
  1170. block searchSubs:
  1171. for j in 0..high(subs):
  1172. c.ml = 0
  1173. var x = rawMatch(s, subs[j][0], i, c)
  1174. if x > 0:
  1175. fillMatches(s, caps, c)
  1176. addf(result, subs[j][1], caps)
  1177. inc(i, x)
  1178. break searchSubs
  1179. add(result, s[i])
  1180. inc(i)
  1181. # copy the rest:
  1182. add(result, substr(s, i))
  1183. when not defined(nimHasEffectsOf):
  1184. {.pragma: effectsOf.}
  1185. func replace*(s: string, sub: Peg, cb: proc(
  1186. match: int, cnt: int, caps: openArray[string]): string): string {.
  1187. rtl, extern: "npegs$1cb", effectsOf: cb.} =
  1188. ## Replaces `sub` in `s` by the resulting strings from the callback.
  1189. ## The callback proc receives the index of the current match (starting with 0),
  1190. ## the count of captures and an open array with the captures of each match. Examples:
  1191. ##
  1192. ## .. code-block:: nim
  1193. ##
  1194. ## func handleMatches*(m: int, n: int, c: openArray[string]): string =
  1195. ## result = ""
  1196. ## if m > 0:
  1197. ## result.add ", "
  1198. ## result.add case n:
  1199. ## of 2: c[0].toLower & ": '" & c[1] & "'"
  1200. ## of 1: c[0].toLower & ": ''"
  1201. ## else: ""
  1202. ##
  1203. ## let s = "Var1=key1;var2=Key2; VAR3"
  1204. ## echo s.replace(peg"{\ident}('='{\ident})* ';'* \s*", handleMatches)
  1205. ##
  1206. ## Results in:
  1207. ##
  1208. ## .. code-block:: nim
  1209. ##
  1210. ## "var1: 'key1', var2: 'Key2', var3: ''"
  1211. result = ""
  1212. var i = 0
  1213. var caps: array[0..MaxSubpatterns-1, string]
  1214. var c: Captures
  1215. var m = 0
  1216. while i < s.len:
  1217. c.ml = 0
  1218. var x = rawMatch(s, sub, i, c)
  1219. if x <= 0:
  1220. add(result, s[i])
  1221. inc(i)
  1222. else:
  1223. fillMatches(s, caps, c)
  1224. add(result, cb(m, c.ml, caps))
  1225. inc(i, x)
  1226. inc(m)
  1227. add(result, substr(s, i))
  1228. when not defined(js):
  1229. proc transformFile*(infile, outfile: string,
  1230. subs: varargs[tuple[pattern: Peg, repl: string]]) {.
  1231. rtl, extern: "npegs$1".} =
  1232. ## reads in the file `infile`, performs a parallel replacement (calls
  1233. ## `parallelReplace`) and writes back to `outfile`. Raises ``IOError`` if an
  1234. ## error occurs. This is supposed to be used for quick scripting.
  1235. ##
  1236. ## **Note**: this proc does not exist while using the JS backend.
  1237. var x = readFile(infile)
  1238. writeFile(outfile, x.parallelReplace(subs))
  1239. iterator split*(s: string, sep: Peg): string =
  1240. ## Splits the string `s` into substrings.
  1241. ##
  1242. ## Substrings are separated by the PEG `sep`.
  1243. ## Examples:
  1244. ##
  1245. ## .. code-block:: nim
  1246. ## for word in split("00232this02939is39an22example111", peg"\d+"):
  1247. ## writeLine(stdout, word)
  1248. ##
  1249. ## Results in:
  1250. ##
  1251. ## .. code-block:: nim
  1252. ## "this"
  1253. ## "is"
  1254. ## "an"
  1255. ## "example"
  1256. ##
  1257. var c: Captures
  1258. var
  1259. first = 0
  1260. last = 0
  1261. while last < len(s):
  1262. c.ml = 0
  1263. var x = rawMatch(s, sep, last, c)
  1264. if x > 0: inc(last, x)
  1265. first = last
  1266. while last < len(s):
  1267. inc(last)
  1268. c.ml = 0
  1269. x = rawMatch(s, sep, last, c)
  1270. if x > 0: break
  1271. if first < last:
  1272. yield substr(s, first, last-1)
  1273. func split*(s: string, sep: Peg): seq[string] {.
  1274. rtl, extern: "npegs$1".} =
  1275. ## Splits the string `s` into substrings.
  1276. result = @[]
  1277. for it in split(s, sep): result.add it
  1278. # ------------------- scanner -------------------------------------------------
  1279. type
  1280. Modifier = enum
  1281. modNone,
  1282. modVerbatim,
  1283. modIgnoreCase,
  1284. modIgnoreStyle
  1285. TokKind = enum ## enumeration of all tokens
  1286. tkInvalid, ## invalid token
  1287. tkEof, ## end of file reached
  1288. tkAny, ## .
  1289. tkAnyRune, ## _
  1290. tkIdentifier, ## abc
  1291. tkStringLit, ## "abc" or 'abc'
  1292. tkCharSet, ## [^A-Z]
  1293. tkParLe, ## '('
  1294. tkParRi, ## ')'
  1295. tkCurlyLe, ## '{'
  1296. tkCurlyRi, ## '}'
  1297. tkCurlyAt, ## '{@}'
  1298. tkEmptyCurl, ## '{}'
  1299. tkArrow, ## '<-'
  1300. tkBar, ## '/'
  1301. tkStar, ## '*'
  1302. tkPlus, ## '+'
  1303. tkAmp, ## '&'
  1304. tkNot, ## '!'
  1305. tkOption, ## '?'
  1306. tkAt, ## '@'
  1307. tkBuiltin, ## \identifier
  1308. tkEscaped, ## \\
  1309. tkBackref, ## '$'
  1310. tkDollar, ## '$'
  1311. tkHat ## '^'
  1312. Token {.final.} = object ## a token
  1313. kind: TokKind ## the type of the token
  1314. modifier: Modifier
  1315. literal: string ## the parsed (string) literal
  1316. charset: set[char] ## if kind == tkCharSet
  1317. index: int ## if kind == tkBackref
  1318. PegLexer {.inheritable.} = object ## the lexer object.
  1319. bufpos: int ## the current position within the buffer
  1320. buf: string ## the buffer itself
  1321. lineNumber: int ## the current line number
  1322. lineStart: int ## index of last line start in buffer
  1323. colOffset: int ## column to add
  1324. filename: string
  1325. const
  1326. tokKindToStr: array[TokKind, string] = [
  1327. "invalid", "[EOF]", ".", "_", "identifier", "string literal",
  1328. "character set", "(", ")", "{", "}", "{@}", "{}",
  1329. "<-", "/", "*", "+", "&", "!", "?",
  1330. "@", "built-in", "escaped", "$", "$", "^"
  1331. ]
  1332. func handleCR(L: var PegLexer, pos: int): int =
  1333. assert(L.buf[pos] == '\c')
  1334. inc(L.lineNumber)
  1335. result = pos+1
  1336. if result < L.buf.len and L.buf[result] == '\L': inc(result)
  1337. L.lineStart = result
  1338. func handleLF(L: var PegLexer, pos: int): int =
  1339. assert(L.buf[pos] == '\L')
  1340. inc(L.lineNumber)
  1341. result = pos+1
  1342. L.lineStart = result
  1343. func init(L: var PegLexer, input, filename: string, line = 1, col = 0) =
  1344. L.buf = input
  1345. L.bufpos = 0
  1346. L.lineNumber = line
  1347. L.colOffset = col
  1348. L.lineStart = 0
  1349. L.filename = filename
  1350. func getColumn(L: PegLexer): int {.inline.} =
  1351. result = abs(L.bufpos - L.lineStart) + L.colOffset
  1352. func getLine(L: PegLexer): int {.inline.} =
  1353. result = L.lineNumber
  1354. func errorStr(L: PegLexer, msg: string, line = -1, col = -1): string =
  1355. var line = if line < 0: getLine(L) else: line
  1356. var col = if col < 0: getColumn(L) else: col
  1357. result = "$1($2, $3) Error: $4" % [L.filename, $line, $col, msg]
  1358. func getEscapedChar(c: var PegLexer, tok: var Token) =
  1359. inc(c.bufpos)
  1360. if c.bufpos >= len(c.buf):
  1361. tok.kind = tkInvalid
  1362. return
  1363. case c.buf[c.bufpos]
  1364. of 'r', 'R', 'c', 'C':
  1365. add(tok.literal, '\c')
  1366. inc(c.bufpos)
  1367. of 'l', 'L':
  1368. add(tok.literal, '\L')
  1369. inc(c.bufpos)
  1370. of 'f', 'F':
  1371. add(tok.literal, '\f')
  1372. inc(c.bufpos)
  1373. of 'e', 'E':
  1374. add(tok.literal, '\e')
  1375. inc(c.bufpos)
  1376. of 'a', 'A':
  1377. add(tok.literal, '\a')
  1378. inc(c.bufpos)
  1379. of 'b', 'B':
  1380. add(tok.literal, '\b')
  1381. inc(c.bufpos)
  1382. of 'v', 'V':
  1383. add(tok.literal, '\v')
  1384. inc(c.bufpos)
  1385. of 't', 'T':
  1386. add(tok.literal, '\t')
  1387. inc(c.bufpos)
  1388. of 'x', 'X':
  1389. inc(c.bufpos)
  1390. if c.bufpos >= len(c.buf):
  1391. tok.kind = tkInvalid
  1392. return
  1393. var xi = 0
  1394. if handleHexChar(c.buf[c.bufpos], xi):
  1395. inc(c.bufpos)
  1396. if handleHexChar(c.buf[c.bufpos], xi):
  1397. inc(c.bufpos)
  1398. if xi == 0: tok.kind = tkInvalid
  1399. else: add(tok.literal, chr(xi))
  1400. of '0'..'9':
  1401. var val = ord(c.buf[c.bufpos]) - ord('0')
  1402. inc(c.bufpos)
  1403. var i = 1
  1404. while (c.bufpos < len(c.buf)) and (i <= 3) and (c.buf[c.bufpos] in {'0'..'9'}):
  1405. val = val * 10 + ord(c.buf[c.bufpos]) - ord('0')
  1406. inc(c.bufpos)
  1407. inc(i)
  1408. if val > 0 and val <= 255: add(tok.literal, chr(val))
  1409. else: tok.kind = tkInvalid
  1410. of '\0'..'\31':
  1411. tok.kind = tkInvalid
  1412. elif c.buf[c.bufpos] in strutils.Letters:
  1413. tok.kind = tkInvalid
  1414. else:
  1415. add(tok.literal, c.buf[c.bufpos])
  1416. inc(c.bufpos)
  1417. func skip(c: var PegLexer) =
  1418. var pos = c.bufpos
  1419. while pos < c.buf.len:
  1420. case c.buf[pos]
  1421. of ' ', '\t':
  1422. inc(pos)
  1423. of '#':
  1424. while (pos < c.buf.len) and
  1425. not (c.buf[pos] in {'\c', '\L', '\0'}): inc(pos)
  1426. of '\c':
  1427. pos = handleCR(c, pos)
  1428. of '\L':
  1429. pos = handleLF(c, pos)
  1430. else:
  1431. break # EndOfFile also leaves the loop
  1432. c.bufpos = pos
  1433. func getString(c: var PegLexer, tok: var Token) =
  1434. tok.kind = tkStringLit
  1435. var pos = c.bufpos + 1
  1436. var quote = c.buf[pos-1]
  1437. while pos < c.buf.len:
  1438. case c.buf[pos]
  1439. of '\\':
  1440. c.bufpos = pos
  1441. getEscapedChar(c, tok)
  1442. pos = c.bufpos
  1443. of '\c', '\L', '\0':
  1444. tok.kind = tkInvalid
  1445. break
  1446. elif c.buf[pos] == quote:
  1447. inc(pos)
  1448. break
  1449. else:
  1450. add(tok.literal, c.buf[pos])
  1451. inc(pos)
  1452. c.bufpos = pos
  1453. func getDollar(c: var PegLexer, tok: var Token) =
  1454. var pos = c.bufpos + 1
  1455. var neg = false
  1456. if pos < c.buf.len and c.buf[pos] == '^':
  1457. neg = true
  1458. inc(pos)
  1459. if pos < c.buf.len and c.buf[pos] in {'0'..'9'}:
  1460. tok.kind = tkBackref
  1461. tok.index = 0
  1462. while pos < c.buf.len and c.buf[pos] in {'0'..'9'}:
  1463. tok.index = tok.index * 10 + ord(c.buf[pos]) - ord('0')
  1464. inc(pos)
  1465. if neg:
  1466. tok.index = -tok.index
  1467. else:
  1468. if neg:
  1469. dec(pos)
  1470. tok.kind = tkDollar
  1471. c.bufpos = pos
  1472. func getCharSet(c: var PegLexer, tok: var Token) =
  1473. tok.kind = tkCharSet
  1474. tok.charset = {}
  1475. var pos = c.bufpos + 1
  1476. var caret = false
  1477. if pos < c.buf.len:
  1478. if c.buf[pos] == '^':
  1479. inc(pos)
  1480. caret = true
  1481. while pos < c.buf.len:
  1482. var ch: char
  1483. case c.buf[pos]
  1484. of ']':
  1485. if pos < c.buf.len: inc(pos)
  1486. break
  1487. of '\\':
  1488. c.bufpos = pos
  1489. getEscapedChar(c, tok)
  1490. pos = c.bufpos
  1491. ch = tok.literal[tok.literal.len-1]
  1492. of '\C', '\L', '\0':
  1493. tok.kind = tkInvalid
  1494. break
  1495. else:
  1496. ch = c.buf[pos]
  1497. inc(pos)
  1498. incl(tok.charset, ch)
  1499. if c.buf[pos] == '-':
  1500. if pos+1 < c.buf.len and c.buf[pos+1] == ']':
  1501. incl(tok.charset, '-')
  1502. inc(pos)
  1503. else:
  1504. if pos+1 < c.buf.len:
  1505. inc(pos)
  1506. else:
  1507. break
  1508. var ch2: char
  1509. case c.buf[pos]
  1510. of '\\':
  1511. c.bufpos = pos
  1512. getEscapedChar(c, tok)
  1513. pos = c.bufpos
  1514. ch2 = tok.literal[tok.literal.len-1]
  1515. of '\C', '\L', '\0':
  1516. tok.kind = tkInvalid
  1517. break
  1518. else:
  1519. if pos+1 < c.buf.len:
  1520. ch2 = c.buf[pos]
  1521. inc(pos)
  1522. else:
  1523. break
  1524. for i in ord(ch)+1 .. ord(ch2):
  1525. incl(tok.charset, chr(i))
  1526. c.bufpos = pos
  1527. if caret: tok.charset = {'\1'..'\xFF'} - tok.charset
  1528. func getSymbol(c: var PegLexer, tok: var Token) =
  1529. var pos = c.bufpos
  1530. while pos < c.buf.len:
  1531. add(tok.literal, c.buf[pos])
  1532. inc(pos)
  1533. if pos < c.buf.len and c.buf[pos] notin strutils.IdentChars: break
  1534. c.bufpos = pos
  1535. tok.kind = tkIdentifier
  1536. func getBuiltin(c: var PegLexer, tok: var Token) =
  1537. if c.bufpos+1 < c.buf.len and c.buf[c.bufpos+1] in strutils.Letters:
  1538. inc(c.bufpos)
  1539. getSymbol(c, tok)
  1540. tok.kind = tkBuiltin
  1541. else:
  1542. tok.kind = tkEscaped
  1543. getEscapedChar(c, tok) # may set tok.kind to tkInvalid
  1544. func getTok(c: var PegLexer, tok: var Token) =
  1545. tok.kind = tkInvalid
  1546. tok.modifier = modNone
  1547. setLen(tok.literal, 0)
  1548. skip(c)
  1549. if c.bufpos >= c.buf.len:
  1550. tok.kind = tkEof
  1551. tok.literal = "[EOF]"
  1552. add(tok.literal, '\0')
  1553. inc(c.bufpos)
  1554. return
  1555. case c.buf[c.bufpos]
  1556. of '{':
  1557. inc(c.bufpos)
  1558. if c.buf[c.bufpos] == '@' and c.bufpos+2 < c.buf.len and
  1559. c.buf[c.bufpos+1] == '}':
  1560. tok.kind = tkCurlyAt
  1561. inc(c.bufpos, 2)
  1562. add(tok.literal, "{@}")
  1563. elif c.buf[c.bufpos] == '}' and c.bufpos < c.buf.len:
  1564. tok.kind = tkEmptyCurl
  1565. inc(c.bufpos)
  1566. add(tok.literal, "{}")
  1567. else:
  1568. tok.kind = tkCurlyLe
  1569. add(tok.literal, '{')
  1570. of '}':
  1571. tok.kind = tkCurlyRi
  1572. inc(c.bufpos)
  1573. add(tok.literal, '}')
  1574. of '[':
  1575. getCharSet(c, tok)
  1576. of '(':
  1577. tok.kind = tkParLe
  1578. inc(c.bufpos)
  1579. add(tok.literal, '(')
  1580. of ')':
  1581. tok.kind = tkParRi
  1582. inc(c.bufpos)
  1583. add(tok.literal, ')')
  1584. of '.':
  1585. tok.kind = tkAny
  1586. inc(c.bufpos)
  1587. add(tok.literal, '.')
  1588. of '_':
  1589. tok.kind = tkAnyRune
  1590. inc(c.bufpos)
  1591. add(tok.literal, '_')
  1592. of '\\':
  1593. getBuiltin(c, tok)
  1594. of '\'', '"': getString(c, tok)
  1595. of '$': getDollar(c, tok)
  1596. of 'a'..'z', 'A'..'Z', '\128'..'\255':
  1597. getSymbol(c, tok)
  1598. if c.bufpos >= c.buf.len:
  1599. return
  1600. if c.buf[c.bufpos] in {'\'', '"'} or
  1601. c.buf[c.bufpos] == '$' and c.bufpos+1 < c.buf.len and
  1602. c.buf[c.bufpos+1] in {'^', '0'..'9'}:
  1603. case tok.literal
  1604. of "i": tok.modifier = modIgnoreCase
  1605. of "y": tok.modifier = modIgnoreStyle
  1606. of "v": tok.modifier = modVerbatim
  1607. else: discard
  1608. setLen(tok.literal, 0)
  1609. if c.buf[c.bufpos] == '$':
  1610. getDollar(c, tok)
  1611. else:
  1612. getString(c, tok)
  1613. if tok.modifier == modNone: tok.kind = tkInvalid
  1614. of '+':
  1615. tok.kind = tkPlus
  1616. inc(c.bufpos)
  1617. add(tok.literal, '+')
  1618. of '*':
  1619. tok.kind = tkStar
  1620. inc(c.bufpos)
  1621. add(tok.literal, '+')
  1622. of '<':
  1623. if c.bufpos+2 < c.buf.len and c.buf[c.bufpos+1] == '-':
  1624. inc(c.bufpos, 2)
  1625. tok.kind = tkArrow
  1626. add(tok.literal, "<-")
  1627. else:
  1628. add(tok.literal, '<')
  1629. of '/':
  1630. tok.kind = tkBar
  1631. inc(c.bufpos)
  1632. add(tok.literal, '/')
  1633. of '?':
  1634. tok.kind = tkOption
  1635. inc(c.bufpos)
  1636. add(tok.literal, '?')
  1637. of '!':
  1638. tok.kind = tkNot
  1639. inc(c.bufpos)
  1640. add(tok.literal, '!')
  1641. of '&':
  1642. tok.kind = tkAmp
  1643. inc(c.bufpos)
  1644. add(tok.literal, '!')
  1645. of '@':
  1646. tok.kind = tkAt
  1647. inc(c.bufpos)
  1648. add(tok.literal, '@')
  1649. if c.buf[c.bufpos] == '@':
  1650. tok.kind = tkCurlyAt
  1651. inc(c.bufpos)
  1652. add(tok.literal, '@')
  1653. of '^':
  1654. tok.kind = tkHat
  1655. inc(c.bufpos)
  1656. add(tok.literal, '^')
  1657. else:
  1658. if c.bufpos >= c.buf.len:
  1659. tok.kind = tkEof
  1660. tok.literal = "[EOF]"
  1661. add(tok.literal, c.buf[c.bufpos])
  1662. inc(c.bufpos)
  1663. func arrowIsNextTok(c: PegLexer): bool =
  1664. # the only look ahead we need
  1665. var pos = c.bufpos
  1666. while pos < c.buf.len and c.buf[pos] in {'\t', ' '}: inc(pos)
  1667. if pos+1 >= c.buf.len:
  1668. return
  1669. result = c.buf[pos] == '<' and c.buf[pos+1] == '-'
  1670. # ----------------------------- parser ----------------------------------------
  1671. type
  1672. EInvalidPeg* = object of ValueError ## raised if an invalid
  1673. ## PEG has been detected
  1674. PegParser = object of PegLexer ## the PEG parser object
  1675. tok: Token
  1676. nonterms: seq[NonTerminal]
  1677. modifier: Modifier
  1678. captures: int
  1679. identIsVerbatim: bool
  1680. skip: Peg
  1681. func pegError(p: PegParser, msg: string, line = -1, col = -1) =
  1682. var e: ref EInvalidPeg
  1683. new(e)
  1684. e.msg = errorStr(p, msg, line, col)
  1685. raise e
  1686. func getTok(p: var PegParser) =
  1687. getTok(p, p.tok)
  1688. if p.tok.kind == tkInvalid: pegError(p, "'" & p.tok.literal & "' is invalid token")
  1689. func eat(p: var PegParser, kind: TokKind) =
  1690. if p.tok.kind == kind: getTok(p)
  1691. else: pegError(p, tokKindToStr[kind] & " expected")
  1692. func parseExpr(p: var PegParser): Peg {.gcsafe.}
  1693. func getNonTerminal(p: var PegParser, name: string): NonTerminal =
  1694. for i in 0..high(p.nonterms):
  1695. result = p.nonterms[i]
  1696. if cmpIgnoreStyle(result.name, name) == 0: return
  1697. # forward reference:
  1698. result = newNonTerminal(name, getLine(p), getColumn(p))
  1699. add(p.nonterms, result)
  1700. func modifiedTerm(s: string, m: Modifier): Peg =
  1701. case m
  1702. of modNone, modVerbatim: result = term(s)
  1703. of modIgnoreCase: result = termIgnoreCase(s)
  1704. of modIgnoreStyle: result = termIgnoreStyle(s)
  1705. func modifiedBackref(s: int, m: Modifier): Peg =
  1706. var
  1707. reverse = s < 0
  1708. index = if reverse: -s else: s
  1709. case m
  1710. of modNone, modVerbatim: result = backref(index, reverse)
  1711. of modIgnoreCase: result = backrefIgnoreCase(index, reverse)
  1712. of modIgnoreStyle: result = backrefIgnoreStyle(index, reverse)
  1713. func builtin(p: var PegParser): Peg =
  1714. # do not use "y", "skip" or "i" as these would be ambiguous
  1715. case p.tok.literal
  1716. of "n": result = newLine()
  1717. of "d": result = charSet({'0'..'9'})
  1718. of "D": result = charSet({'\1'..'\xff'} - {'0'..'9'})
  1719. of "s": result = charSet({' ', '\9'..'\13'})
  1720. of "S": result = charSet({'\1'..'\xff'} - {' ', '\9'..'\13'})
  1721. of "w": result = charSet({'a'..'z', 'A'..'Z', '_', '0'..'9'})
  1722. of "W": result = charSet({'\1'..'\xff'} - {'a'..'z', 'A'..'Z', '_', '0'..'9'})
  1723. of "a": result = charSet({'a'..'z', 'A'..'Z'})
  1724. of "A": result = charSet({'\1'..'\xff'} - {'a'..'z', 'A'..'Z'})
  1725. of "ident": result = pegs.ident
  1726. of "letter": result = unicodeLetter()
  1727. of "upper": result = unicodeUpper()
  1728. of "lower": result = unicodeLower()
  1729. of "title": result = unicodeTitle()
  1730. of "white": result = unicodeWhitespace()
  1731. else: pegError(p, "unknown built-in: " & p.tok.literal)
  1732. func token(terminal: Peg, p: PegParser): Peg =
  1733. if p.skip.kind == pkEmpty: result = terminal
  1734. else: result = sequence(p.skip, terminal)
  1735. func primary(p: var PegParser): Peg =
  1736. case p.tok.kind
  1737. of tkAmp:
  1738. getTok(p)
  1739. return &primary(p)
  1740. of tkNot:
  1741. getTok(p)
  1742. return !primary(p)
  1743. of tkAt:
  1744. getTok(p)
  1745. return !*primary(p)
  1746. of tkCurlyAt:
  1747. getTok(p)
  1748. return !*\primary(p).token(p)
  1749. else: discard
  1750. case p.tok.kind
  1751. of tkIdentifier:
  1752. if p.identIsVerbatim:
  1753. var m = p.tok.modifier
  1754. if m == modNone: m = p.modifier
  1755. result = modifiedTerm(p.tok.literal, m).token(p)
  1756. getTok(p)
  1757. elif not arrowIsNextTok(p):
  1758. var nt = getNonTerminal(p, p.tok.literal)
  1759. incl(nt.flags, ntUsed)
  1760. result = nonterminal(nt).token(p)
  1761. getTok(p)
  1762. else:
  1763. pegError(p, "expression expected, but found: " & p.tok.literal)
  1764. of tkStringLit:
  1765. var m = p.tok.modifier
  1766. if m == modNone: m = p.modifier
  1767. result = modifiedTerm(p.tok.literal, m).token(p)
  1768. getTok(p)
  1769. of tkCharSet:
  1770. if '\0' in p.tok.charset:
  1771. pegError(p, "binary zero ('\\0') not allowed in character class")
  1772. result = charSet(p.tok.charset).token(p)
  1773. getTok(p)
  1774. of tkParLe:
  1775. getTok(p)
  1776. result = parseExpr(p)
  1777. eat(p, tkParRi)
  1778. of tkCurlyLe:
  1779. getTok(p)
  1780. result = capture(parseExpr(p)).token(p)
  1781. eat(p, tkCurlyRi)
  1782. inc(p.captures)
  1783. of tkEmptyCurl:
  1784. result = capture()
  1785. getTok(p)
  1786. of tkAny:
  1787. result = any().token(p)
  1788. getTok(p)
  1789. of tkAnyRune:
  1790. result = anyRune().token(p)
  1791. getTok(p)
  1792. of tkBuiltin:
  1793. result = builtin(p).token(p)
  1794. getTok(p)
  1795. of tkEscaped:
  1796. result = term(p.tok.literal[0]).token(p)
  1797. getTok(p)
  1798. of tkDollar:
  1799. result = endAnchor()
  1800. getTok(p)
  1801. of tkHat:
  1802. result = startAnchor()
  1803. getTok(p)
  1804. of tkBackref:
  1805. if abs(p.tok.index) > p.captures or p.tok.index == 0:
  1806. pegError(p, "invalid back reference index: " & $p.tok.index)
  1807. var m = p.tok.modifier
  1808. if m == modNone: m = p.modifier
  1809. result = modifiedBackref(p.tok.index, m).token(p)
  1810. getTok(p)
  1811. else:
  1812. pegError(p, "expression expected, but found: " & p.tok.literal)
  1813. getTok(p) # we must consume a token here to prevent endless loops!
  1814. while true:
  1815. case p.tok.kind
  1816. of tkOption:
  1817. result = ?result
  1818. getTok(p)
  1819. of tkStar:
  1820. result = *result
  1821. getTok(p)
  1822. of tkPlus:
  1823. result = +result
  1824. getTok(p)
  1825. else: break
  1826. func seqExpr(p: var PegParser): Peg =
  1827. result = primary(p)
  1828. while true:
  1829. case p.tok.kind
  1830. of tkAmp, tkNot, tkAt, tkStringLit, tkCharSet, tkParLe, tkCurlyLe,
  1831. tkAny, tkAnyRune, tkBuiltin, tkEscaped, tkDollar, tkBackref,
  1832. tkHat, tkCurlyAt, tkEmptyCurl:
  1833. result = sequence(result, primary(p))
  1834. of tkIdentifier:
  1835. if not arrowIsNextTok(p):
  1836. result = sequence(result, primary(p))
  1837. else: break
  1838. else: break
  1839. func parseExpr(p: var PegParser): Peg =
  1840. result = seqExpr(p)
  1841. while p.tok.kind == tkBar:
  1842. getTok(p)
  1843. result = result / seqExpr(p)
  1844. func parseRule(p: var PegParser): NonTerminal =
  1845. if p.tok.kind == tkIdentifier and arrowIsNextTok(p):
  1846. result = getNonTerminal(p, p.tok.literal)
  1847. if ntDeclared in result.flags:
  1848. pegError(p, "attempt to redefine: " & result.name)
  1849. result.line = getLine(p)
  1850. result.col = getColumn(p)
  1851. getTok(p)
  1852. eat(p, tkArrow)
  1853. result.rule = parseExpr(p)
  1854. incl(result.flags, ntDeclared) # NOW inlining may be attempted
  1855. else:
  1856. pegError(p, "rule expected, but found: " & p.tok.literal)
  1857. func rawParse(p: var PegParser): Peg =
  1858. ## parses a rule or a PEG expression
  1859. while p.tok.kind == tkBuiltin:
  1860. case p.tok.literal
  1861. of "i":
  1862. p.modifier = modIgnoreCase
  1863. getTok(p)
  1864. of "y":
  1865. p.modifier = modIgnoreStyle
  1866. getTok(p)
  1867. of "skip":
  1868. getTok(p)
  1869. p.skip = ?primary(p)
  1870. else: break
  1871. if p.tok.kind == tkIdentifier and arrowIsNextTok(p):
  1872. result = parseRule(p).rule
  1873. while p.tok.kind != tkEof:
  1874. discard parseRule(p)
  1875. else:
  1876. p.identIsVerbatim = true
  1877. result = parseExpr(p)
  1878. if p.tok.kind != tkEof:
  1879. pegError(p, "EOF expected, but found: " & p.tok.literal)
  1880. for i in 0..high(p.nonterms):
  1881. var nt = p.nonterms[i]
  1882. if ntDeclared notin nt.flags:
  1883. pegError(p, "undeclared identifier: " & nt.name, nt.line, nt.col)
  1884. elif ntUsed notin nt.flags and i > 0:
  1885. pegError(p, "unused rule: " & nt.name, nt.line, nt.col)
  1886. func parsePeg*(pattern: string, filename = "pattern", line = 1, col = 0): Peg =
  1887. ## constructs a Peg object from `pattern`. `filename`, `line`, `col` are
  1888. ## used for error messages, but they only provide start offsets. `parsePeg`
  1889. ## keeps track of line and column numbers within `pattern`.
  1890. var p: PegParser
  1891. init(PegLexer(p), pattern, filename, line, col)
  1892. p.tok.kind = tkInvalid
  1893. p.tok.modifier = modNone
  1894. p.tok.literal = ""
  1895. p.tok.charset = {}
  1896. p.nonterms = @[]
  1897. p.identIsVerbatim = false
  1898. getTok(p)
  1899. result = rawParse(p)
  1900. func peg*(pattern: string): Peg =
  1901. ## constructs a Peg object from the `pattern`. The short name has been
  1902. ## chosen to encourage its use as a raw string modifier::
  1903. ##
  1904. ## peg"{\ident} \s* '=' \s* {.*}"
  1905. result = parsePeg(pattern, "pattern")
  1906. func escapePeg*(s: string): string =
  1907. ## escapes `s` so that it is matched verbatim when used as a peg.
  1908. result = ""
  1909. var inQuote = false
  1910. for c in items(s):
  1911. case c
  1912. of '\0'..'\31', '\'', '"', '\\':
  1913. if inQuote:
  1914. result.add('\'')
  1915. inQuote = false
  1916. result.add("\\x")
  1917. result.add(toHex(ord(c), 2))
  1918. else:
  1919. if not inQuote:
  1920. result.add('\'')
  1921. inQuote = true
  1922. result.add(c)
  1923. if inQuote: result.add('\'')