--lexer.lua is part of Penlight http://stevedonovan.github.com/Penlight/
--[[
Copyright (C) 2009 Steve Donovan, David Manura.
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
associated documentation files (the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
]]
-- NOTE: This version is modified to handle love2d calls separately and has a few other tweaks

--- Lexical scanner for creating a sequence of tokens from text.
-- `lexer.scan(s)` returns an iterator over all tokens found in the
-- string `s`. This iterator returns two values, a token type string
-- (such as 'string' for quoted string, 'iden' for identifier) and the value of the
-- token.
--
-- Versions specialized for Lua and C are available; these also handle block comments
-- and classify keywords as 'keyword' tokens. For example:
--
-- > s = 'for i=1,n do'
-- > for t,v in lexer.lua(s) do print(t,v) end
-- keyword for
-- iden i
-- = =
-- number 1
-- , ,
-- iden n
-- keyword do
--
-- See the Guide for further @{06-data.md.Lexical_Scanning|discussion}
-- @module pl.lexer
local yield,wrap = coroutine.yield,coroutine.wrap
local strfind = string.find
local strsub = string.sub
local append = table.insert

local function assert_arg(idx,val,tp)
    if type(val) ~= tp then
        error("argument "..idx.." must be "..tp, 2)
    end
end

local lexer = {}
local NUMBER1 = '^[+%-]?%d+%.?%d*[eE][+%-]?%d+'
local NUMBER2 = '^[+%-]?%d+%.?%d*'
local NUMBER3 = '^0[xX]%x+'
local NUMBER4 = '^%.%d+[eE][+%-]?%d+' -- e.g. .123E(+)768
local NUMBER5 = '^%.%d+'
local NUMBER6 = '^%d+%.?%d*[eE][+%-]?%d+' -- e.g. 123(.)(456)E(+)768
local NUMBER7 = '^%d+%.?%d*'
local IDEN = '^[%a_][%w_.]*'
local WSPACE = '^%s+'
local STRING1 = "^(['\"])%1" -- empty string
local STRING2 = [[^(['"])(\*)%2%1]]
local STRING3 = [[^(['"]).-[^\](\*)%2%1]]
local CHAR1 = "^''"
local CHAR2 = [[^'(\*)%1']]
local CHAR3 = [[^'.-[^\](\*)%1']]
local PREPRO = '^#.-[^\\]\n'
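
-- Note added for clarity (not part of the original Penlight source): the STRING
-- and CHAR patterns capture the run of backslashes before the closing quote and
-- require it twice (`(\*)%2`), so the run must have even length. A quote preceded
-- by an odd number of backslashes -- i.e. an escaped quote, as in "he said \"hi\"" --
-- therefore cannot close the token early; STRING2/CHAR2 cover the special case
-- where the whole content is such a run of backslashes.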

local plain_matches,lua_matches,cpp_matches,lua_keyword,cpp_keyword

local function tdump(tok)
    return yield(tok,tok)
end

local function ndump(tok,options)
    if options and options.number then
        tok = tonumber(tok)
    end
    return yield("number",tok)
end

-- regular strings, single or double quotes; usually we want them
-- without the quotes
local function sdump(tok,options)
    if options and options.string then
        tok = tok:sub(2,-2)
    end
    return yield("string",tok)
end

-- long Lua strings need extra work to get rid of the quotes
local function sdump_l(tok,options,findres)
    if options and options.string then
        local quotelen = 3
        if findres[3] then
            quotelen = quotelen + findres[3]:len()
        end
        tok = tok:sub(quotelen, -quotelen)
        if tok:sub(1, 1) == "\n" then
            tok = tok:sub(2)
        end
    end
    return yield("string",tok)
end

local function chdump(tok,options)
    if options and options.string then
        tok = tok:sub(2,-2)
    end
    return yield("char",tok)
end

local function cdump(tok)
    return yield('comment',tok)
end

local function wsdump (tok)
    return yield("space",tok)
end

local function pdump (tok)
    return yield('prepro',tok)
end

local function plain_vdump(tok)
    return yield("iden",tok)
end

local IDENLOVE = '^(love[%w_.]*%.)([%a_][%w_]*)$'

local function lua_vdump(tok)
    if lua_keyword[tok] then
        return yield("keyword",tok)
    end
    local m, n = string.match(tok, IDENLOVE)
    if m then
        return yield("idenlovepre", m), yield("idenlove", n)
    end
    return yield("iden",tok)
end
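
-- Illustration of the love2d tweak above (added comment, not standard Penlight
-- behaviour): a dotted identifier such as `love.graphics.print` is reported as
-- two tokens, 'idenlovepre' with value "love.graphics." followed by 'idenlove'
-- with value "print"; every other identifier is still reported as plain 'iden'.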

local function cpp_vdump(tok)
    if cpp_keyword[tok] then
        return yield("keyword",tok)
    else
        return yield("iden",tok)
    end
end

--- create a plain token iterator from a string or file-like object.
-- @tparam string|file s a string or a file-like object with a `:read()` method returning lines.
-- @tab matches an optional match table - an array of token descriptions.
-- A token is described by a `{pattern, action}` pair, where `pattern` should match
-- the token body and `action` is a function called when a token of the described type is found.
-- @tab[opt] filter a table of token types to exclude, by default `{space=true}`
-- @tab[opt] options a table of options; by default `{number=true,string=true}`,
-- which means convert numbers and strip string quotes.
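--
-- A usage sketch (added comment, not from the original docs); with the default
-- matcher, filter and options, spaces are skipped and numbers are converted:
--
-- > for t,v in lexer.scan('answer = 42') do print(t,v) end
-- iden answer
-- = =
-- number 42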
function lexer.scan(s,matches,filter,options)
    local file = type(s) ~= 'string' and s
    filter = filter or {space=true}
    options = options or {number=true,string=true}
    if filter then
        if filter.space then filter[wsdump] = true end
        if filter.comments then
            filter[cdump] = true
        end
    end
    if not matches then
        if not plain_matches then
            plain_matches = {
                {WSPACE,wsdump},
                {NUMBER3,ndump},
                {IDEN,plain_vdump},
                {NUMBER1,ndump},
                {NUMBER2,ndump},
                {STRING1,sdump},
                {STRING2,sdump},
                {STRING3,sdump},
                {'^.',tdump}
            }
        end
        matches = plain_matches
    end

    local function lex(first_arg)
        local line_nr = 0
        local next_line = file and file:read()
        local sz = file and 0 or #s
        local idx = 1

        -- res is the value used to resume the coroutine.
        local function handle_requests(res)
            while res do
                local tp = type(res)
                -- insert a token list
                if tp == 'table' then
                    -- keep a reference to the token list before yielding;
                    -- `res` is overwritten by the next resume value, so iterating
                    -- `res` directly would hand nil to `ipairs` on the next call
                    local toklist = res
                    res = yield('','')
                    for _,t in ipairs(toklist) do
                        res = yield(t[1],t[2])
                    end
                elseif tp == 'string' then -- or search up to some special pattern
                    local i1,i2 = strfind(s,res,idx)
                    if i1 then
                        local tok = strsub(s,i1,i2)
                        idx = i2 + 1
                        res = yield('',tok)
                    else
                        res = yield('','')
                        idx = sz + 1
                    end
                else
                    res = yield(line_nr,idx)
                end
            end
        end

        handle_requests(first_arg)
        if not file then line_nr = 1 end

        while true do
            if idx > sz then
                if file then
                    if not next_line then return end
                    s = next_line
                    line_nr = line_nr + 1
                    next_line = file:read()
                    if next_line then
                        s = s .. '\n'
                    end
                    idx, sz = 1, #s
                else
                    while true do
                        handle_requests(yield())
                    end
                end
            end

            for _,m in ipairs(matches) do
                local pat = m[1]
                local fun = m[2]
                local findres = {strfind(s,pat,idx)}
                local i1, i2 = findres[1], findres[2]
                if i1 then
                    local tok = strsub(s,i1,i2)
                    idx = i2 + 1
                    local res
                    if not (filter and filter[fun]) then
                        lexer.finished = idx > sz
                        res = fun(tok, options, findres)
                    end
                    if not file and tok:find("\n") then
                        -- Update line number.
                        local _, newlines = tok:gsub("\n", {})
                        line_nr = line_nr + newlines
                    end
                    handle_requests(res)
                    break
                end
            end
        end
    end
    return wrap(lex)
end

local function isstring (s)
    return type(s) == 'string'
end

--- insert tokens into a stream.
-- @param tok a token stream
-- @param a1 either a token type string, a token list (a table of `{type,value}`
-- pairs), or a token-like iterator function returning type and value
-- @string a2 the token value, when `a1` is a type string
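--
-- A usage sketch (added comment, not from the original docs):
--
-- > tok = lexer.lua('x = 1')
-- > lexer.insert(tok,'keyword','local') -- queue one extra token
-- > print(tok()) -- keyword local
-- > print(tok()) -- iden x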
function lexer.insert (tok,a1,a2)
    if not a1 then return end
    local ts
    if isstring(a1) and isstring(a2) then
        ts = {{a1,a2}}
    elseif type(a1) == 'function' then
        ts = {}
        for t,v in a1() do
            append(ts,{t,v})
        end
    else
        ts = a1
    end
    tok(ts)
end

--- get everything in a stream up to a newline.
-- @param tok a token stream
-- @return a string
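--
-- For example (added sketch, not from the original docs):
--
-- > tok = lexer.lua('first line\nsecond line\n')
-- > print(lexer.getline(tok)) -- 'first line' plus the trailing newline
-- > print(tok()) -- iden second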
function lexer.getline (tok)
    local _,v = tok('.-\n')
    return v
end

--- get current line number.
-- @param tok a token stream
-- @return the line number.
-- if the input source is a file-like object,
-- also return the column.
function lexer.lineno (tok)
    return tok(0)
end

--- get the rest of the stream.
-- @param tok a token stream
-- @return a string
function lexer.getrest (tok)
    local _,v = tok('.+')
    return v
end

--- get the Lua keywords as a set-like table.
-- So `res["and"]` etc would be `true`.
-- @return a table
function lexer.get_keywords ()
    if not lua_keyword then
        lua_keyword = {
            ["and"] = true, ["break"] = true, ["do"] = true,
            ["else"] = true, ["elseif"] = true, ["end"] = true,
            ["false"] = true, ["for"] = true, ["function"] = true,
            ["if"] = true, ["in"] = true, ["local"] = true, ["nil"] = true,
            ["not"] = true, ["or"] = true, ["repeat"] = true,
            ["return"] = true, ["then"] = true, ["true"] = true,
            ["until"] = true, ["while"] = true
        }
    end
    return lua_keyword
end

--- create a Lua token iterator from a string or file-like object.
-- Will return the token type and value.
-- @string s the string
-- @tab[opt] filter a table of token types to exclude, by default `{space=true,comments=true}`
-- @tab[opt] options a table of options; by default `{number=true,string=true}`,
-- which means convert numbers and strip string quotes.
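--
-- A sketch of non-default options (added comment, not from the original docs):
-- keep comments in the stream and keep the quotes on string tokens:
--
-- > s = 'x = "hi" -- greeting'
-- > for t,v in lexer.lua(s, {space=true}, {number=true,string=false}) do print(t,v) end
-- iden x
-- = =
-- string "hi"
-- comment -- greeting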
function lexer.lua(s,filter,options)
    filter = filter or {space=true,comments=true}
    lexer.get_keywords()
    if not lua_matches then
        lua_matches = {
            {WSPACE,wsdump},
            {NUMBER3,ndump},
            {IDEN,lua_vdump},
            {NUMBER4,ndump},
            {NUMBER5,ndump},
            {NUMBER6,ndump},
            {NUMBER7,ndump},
            {STRING1,sdump},
            {STRING2,sdump},
            {STRING3,sdump},
            {'^%-%-%[(=*)%[.-%]%1%]',cdump},
            {'^%-%-[^\n]*',cdump},
            {'^%[(=*)%[.-%]%1%]',sdump_l},
            {'^==',tdump},
            {'^~=',tdump},
            {'^<=',tdump},
            {'^>=',tdump},
            {'^%.%.%.',tdump},
            {'^%.%.',tdump},
            {'^.',tdump}
        }
    end
    return lexer.scan(s,lua_matches,filter,options)
end

--- create a C/C++ token iterator from a string or file-like object.
-- Will return the token type and value.
-- @string s the string
-- @tab[opt] filter a table of token types to exclude, by default `{space=true,comments=true}`
-- @tab[opt] options a table of options; by default `{number=true,string=true}`,
-- which means convert numbers and strip string quotes.
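--
-- For example (added sketch, not from the original docs):
--
-- > for t,v in lexer.cpp('void f(int x);') do print(t,v) end
-- keyword void
-- iden f
-- ( (
-- keyword int
-- iden x
-- ) )
-- ; ;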
function lexer.cpp(s,filter,options)
    filter = filter or {space=true,comments=true}
    if not cpp_keyword then
        cpp_keyword = {
            ["class"] = true, ["break"] = true, ["do"] = true, ["sizeof"] = true,
            ["else"] = true, ["continue"] = true, ["struct"] = true,
            ["false"] = true, ["for"] = true, ["public"] = true, ["void"] = true,
            ["private"] = true, ["protected"] = true, ["goto"] = true,
            ["if"] = true, ["static"] = true, ["const"] = true, ["typedef"] = true,
            ["enum"] = true, ["char"] = true, ["int"] = true, ["bool"] = true,
            ["long"] = true, ["float"] = true, ["true"] = true, ["delete"] = true,
            ["double"] = true, ["while"] = true, ["new"] = true,
            ["namespace"] = true, ["try"] = true, ["catch"] = true,
            ["switch"] = true, ["case"] = true, ["extern"] = true,
            ["return"] = true, ["default"] = true, ["unsigned"] = true, ["signed"] = true,
            ["union"] = true, ["volatile"] = true, ["register"] = true, ["short"] = true,
        }
    end
    if not cpp_matches then
        cpp_matches = {
            {WSPACE,wsdump},
            {PREPRO,pdump},
            {NUMBER3,ndump},
            {IDEN,cpp_vdump},
            {NUMBER4,ndump},
            {NUMBER5,ndump},
            -- NUMBER4/5 only cover numbers that start with a dot in this modified
            -- version, so also match ordinary integers and decimals here
            {NUMBER6,ndump},
            {NUMBER7,ndump},
            {CHAR1,chdump},
            {CHAR2,chdump},
            {CHAR3,chdump},
            {STRING1,sdump},
            {STRING2,sdump},
            {STRING3,sdump},
            {'^//.-\n',cdump},
            {'^/%*.-%*/',cdump},
            {'^==',tdump},
            {'^!=',tdump},
            {'^<=',tdump},
            {'^>=',tdump},
            {'^->',tdump},
            {'^&&',tdump},
            {'^||',tdump},
            {'^%+%+',tdump},
            {'^%-%-',tdump},
            {'^%+=',tdump},
            {'^%-=',tdump},
            {'^%*=',tdump},
            {'^/=',tdump},
            {'^|=',tdump},
            {'^%^=',tdump},
            {'^::',tdump},
            {'^.',tdump}
        }
    end
    return lexer.scan(s,cpp_matches,filter,options)
end

--- get a list of parameters separated by a delimiter from a stream.
-- @param tok the token stream
-- @string[opt=')'] endtoken end of list. Can be '\n'
-- @string[opt=','] delim separator
-- @return a list of token lists.
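--
-- For example (added sketch, not from the original docs), after the opening
-- parenthesis has already been consumed:
--
-- > tok = lexer.lua('(a, b + 1) rest')
-- > tok() -- consume the '('
-- > args = lexer.get_separated_list(tok)
-- `args` is now { {{'iden','a'}}, {{'iden','b'},{'+','+'},{'number',1}} }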
function lexer.get_separated_list(tok,endtoken,delim)
    endtoken = endtoken or ')'
    delim = delim or ','
    local parm_values = {}
    local level = 1 -- used to count ( and )
    local tl = {}
    local function tappend (tl,t,val)
        val = val or t
        append(tl,{t,val})
    end
    local is_end
    if endtoken == '\n' then
        is_end = function(t,val)
            return t == 'space' and val:find '\n'
        end
    else
        is_end = function (t)
            return t == endtoken
        end
    end
    local token,value
    while true do
        token,value=tok()
        if not token then return nil,'EOS' end -- end of stream is an error!
        if is_end(token,value) and level == 1 then
            append(parm_values,tl)
            break
        elseif token == '(' then
            level = level + 1
            tappend(tl,'(')
        elseif token == ')' then
            level = level - 1
            if level == 0 then -- finished with parm list
                append(parm_values,tl)
                break
            else
                tappend(tl,')')
            end
        elseif token == delim and level == 1 then
            append(parm_values,tl) -- a new parm
            tl = {}
        else
            tappend(tl,token,value)
        end
    end
    return parm_values,{token,value}
end

--- get the next non-space token from the stream.
-- @param tok the token stream.
function lexer.skipws (tok)
    local t,v = tok()
    while t == 'space' do
        t,v = tok()
    end
    return t,v
end

local skipws = lexer.skipws

--- get the next token, which must be of the expected type.
-- Throws an error if the type does not match!
-- @param tok the token stream
-- @string expected_type the token type
-- @bool no_skip_ws if true, do not skip whitespace before reading the token
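--
-- For example (added sketch, not from the original docs):
--
-- > tok = lexer.lua('function foo()')
-- > lexer.expecting(tok,'keyword') --> 'function'
-- > lexer.expecting(tok,'iden')    --> 'foo'
-- > lexer.expecting(tok,'number')  --> raises "expecting number"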
function lexer.expecting (tok,expected_type,no_skip_ws)
    assert_arg(1,tok,'function')
    assert_arg(2,expected_type,'string')
    local t,v
    if no_skip_ws then
        t,v = tok()
    else
        t,v = skipws(tok)
    end
    if t ~= expected_type then error ("expecting "..expected_type,2) end
    return v
end

return lexer