--lexer.lua is part of Penlight http://stevedonovan.github.com/Penlight/
--[[
Copyright (C) 2009 Steve Donovan, David Manura.
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
associated documentation files (the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
]]

-- NOTE: This version is modified to handle love2d calls separately and has a few other tweaks

--- Lexical scanner for creating a sequence of tokens from text.
-- `lexer.scan(s)` returns an iterator over all tokens found in the
-- string `s`. This iterator returns two values, a token type string
-- (such as 'string' for quoted string, 'iden' for identifier) and the value of the
-- token.
--
-- Versions specialized for Lua and C are available; these also handle block comments
-- and classify keywords as 'keyword' tokens. For example:
--
--    > s = 'for i=1,n do'
--    > for t,v in lexer.lua(s) do print(t,v) end
--    keyword for
--    iden    i
--    =       =
--    number  1
--    ,       ,
--    iden    n
--    keyword do
--
-- See the Guide for further @{06-data.md.Lexical_Scanning|discussion}
-- @module pl.lexer

local yield,wrap = coroutine.yield,coroutine.wrap
local strfind = string.find
local strsub = string.sub
local append = table.insert

local function assert_arg(idx,val,tp)
    if type(val) ~= tp then
        error("argument "..idx.." must be "..tp, 2)
    end
end

local lexer = {}

local NUMBER1 = '^[+%-]?%d+%.?%d*[eE][+%-]?%d+'
local NUMBER2 = '^[+%-]?%d+%.?%d*'
local NUMBER3 = '^0[xX]%x+'
local NUMBER4 = '^%.%d+[eE][+%-]?%d+' -- e.g. .123E(+)768
local NUMBER5 = '^%.%d+'
local NUMBER6 = '^%d+%.?%d*[eE][+%-]?%d+' -- e.g. 123(.)(456)E(+)768
local NUMBER7 = '^%d+%.?%d*'
local IDEN = '^[%a_][%w_.]*'
local WSPACE = '^%s+'
local STRING1 = "^(['\"])%1" -- empty string
local STRING2 = [[^(['"])(\*)%2%1]]
local STRING3 = [[^(['"]).-[^\](\*)%2%1]]
local CHAR1 = "^''"
local CHAR2 = [[^'(\*)%1']]
local CHAR3 = [[^'.-[^\](\*)%1']]
local PREPRO = '^#.-[^\\]\n'

local plain_matches,lua_matches,cpp_matches,lua_keyword,cpp_keyword

local function tdump(tok)
    return yield(tok,tok)
end

local function ndump(tok,options)
    if options and options.number then
        tok = tonumber(tok)
    end
    return yield("number",tok)
end

-- regular strings, single or double quotes; usually we want them
-- without the quotes
local function sdump(tok,options)
    if options and options.string then
        tok = tok:sub(2,-2)
    end
    return yield("string",tok)
end

-- long Lua strings need extra work to get rid of the quotes
local function sdump_l(tok,options,findres)
    if options and options.string then
        local quotelen = 3
        if findres[3] then
            quotelen = quotelen + findres[3]:len()
        end
        tok = tok:sub(quotelen, -quotelen)
        if tok:sub(1, 1) == "\n" then
            tok = tok:sub(2)
        end
    end
    return yield("string",tok)
end

local function chdump(tok,options)
    if options and options.string then
        tok = tok:sub(2,-2)
    end
    return yield("char",tok)
end

local function cdump(tok)
    return yield('comment',tok)
end

local function wsdump (tok)
    return yield("space",tok)
end

local function pdump (tok)
    return yield('prepro',tok)
end

local function plain_vdump(tok)
    return yield("iden",tok)
end

local IDENLOVE = '^(love[%w_.]*%.)([%a_][%w_]*)$'

local function lua_vdump(tok)
    if lua_keyword[tok] then
        return yield("keyword",tok)
    end
    local m, n = string.match(tok, IDENLOVE)
    if m then
        return yield("idenlovepre", m), yield("idenlove", n)
    end
    return yield("iden",tok)
end

local function cpp_vdump(tok)
    if cpp_keyword[tok] then
        return yield("keyword",tok)
    else
        return yield("iden",tok)
    end
end

--- create a plain token iterator from a string or file-like object.
-- @tparam string|file s a string or a file-like object with a `:read()` method returning lines.
-- @tab matches an optional match table - an array of token descriptions.
-- A token is described by a `{pattern, action}` pair, where `pattern` should match
-- the token body and `action` is a function called when a token of the described type is found.
-- @tab[opt] filter a table of token types to exclude, by default `{space=true}`
-- @tab[opt] options a table of options; by default, `{number=true,string=true}`,
-- which means convert numbers and strip string quotes.
function lexer.scan(s,matches,filter,options)
    local file = type(s) ~= 'string' and s
    filter = filter or {space=true}
    options = options or {number=true,string=true}
    if filter then
        if filter.space then filter[wsdump] = true end
        if filter.comments then
            filter[cdump] = true
        end
    end
    if not matches then
        if not plain_matches then
            plain_matches = {
                {WSPACE,wsdump},
                {NUMBER3,ndump},
                {IDEN,plain_vdump},
                {NUMBER1,ndump},
                {NUMBER2,ndump},
                {STRING1,sdump},
                {STRING2,sdump},
                {STRING3,sdump},
                {'^.',tdump}
            }
        end
        matches = plain_matches
    end
    local function lex(first_arg)
        local line_nr = 0
        local next_line = file and file:read()
        local sz = file and 0 or #s
        local idx = 1

        -- res is the value used to resume the coroutine.
        local function handle_requests(res)
            while res do
                local tp = type(res)
                -- insert a token list
                if tp == 'table' then
                    res = yield('','')
                    for _,t in ipairs(res) do
                        res = yield(t[1],t[2])
                    end
                elseif tp == 'string' then -- or search up to some special pattern
                    local i1,i2 = strfind(s,res,idx)
                    if i1 then
                        local tok = strsub(s,i1,i2)
                        idx = i2 + 1
                        res = yield('',tok)
                    else
                        res = yield('','')
                        idx = sz + 1
                    end
                else
                    res = yield(line_nr,idx)
                end
            end
        end

        handle_requests(first_arg)
        if not file then line_nr = 1 end

        while true do
            if idx > sz then
                if file then
                    if not next_line then return end
                    s = next_line
                    line_nr = line_nr + 1
                    next_line = file:read()
                    if next_line then
                        s = s .. '\n'
                    end
                    idx, sz = 1, #s
                else
                    while true do
                        handle_requests(yield())
                    end
                end
            end

            for _,m in ipairs(matches) do
                local pat = m[1]
                local fun = m[2]
                local findres = {strfind(s,pat,idx)}
                local i1, i2 = findres[1], findres[2]
                if i1 then
                    local tok = strsub(s,i1,i2)
                    idx = i2 + 1
                    local res
                    if not (filter and filter[fun]) then
                        lexer.finished = idx > sz
                        res = fun(tok, options, findres)
                    end
                    if not file and tok:find("\n") then
                        -- Update the line number.
                        local _, newlines = tok:gsub("\n", {})
                        line_nr = line_nr + newlines
                    end
                    handle_requests(res)
                    break
                end
            end
        end
    end
    return wrap(lex)
end
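
-- A minimal usage sketch for `lexer.scan` (illustrative only, not part of the
-- original docs; it assumes the module is in scope as `lexer`). With the
-- defaults, whitespace is skipped and numeric tokens are converted to numbers:
--
--    for t,v in lexer.scan('x = 10 + y') do print(t,v) end
--    iden    x
--    =       =
--    number  10
--    +       +
--    iden    y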

local function isstring (s)
    return type(s) == 'string'
end

--- insert tokens into a stream.
-- @param tok a token stream
-- @param a1 a string is the type, a table is a token list and
-- a function is assumed to be a token-like iterator (returns type & value)
-- @string a2 a string is the value
function lexer.insert (tok,a1,a2)
    if not a1 then return end
    local ts
    if isstring(a1) and isstring(a2) then
        ts = {{a1,a2}}
    elseif type(a1) == 'function' then
        ts = {}
        for t,v in a1() do
            append(ts,{t,v})
        end
    else
        ts = a1
    end
    tok(ts)
end
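
-- The three documented call forms, as a hedged sketch (the token types and
-- values here are made up for illustration):
--
--    lexer.insert(tok, 'keyword', 'end')            -- a single type/value pair
--    lexer.insert(tok, {{'iden','x'}, {'=','='}})   -- an explicit token list
--    lexer.insert(tok, lexer.lua('return x'))       -- tokens from another iterator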

--- get everything in a stream up to a newline.
-- @param tok a token stream
-- @return a string
function lexer.getline (tok)
    local _,v = tok('.-\n')
    return v
end

--- get the current line number.
-- @param tok a token stream
-- @return the line number.
-- If the input source is a file-like object,
-- also return the column.
function lexer.lineno (tok)
    return tok(0)
end

--- get the rest of the stream.
-- @param tok a token stream
-- @return a string
function lexer.getrest (tok)
    local _,v = tok('.+')
    return v
end
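
-- A hedged sketch of the stream helpers above (illustrative; exact values
-- depend on how much of the stream has already been consumed):
--
--    local tok = lexer.lua('first = 1\nsecond = 2')
--    tok()                 --> 'iden', 'first'
--    lexer.getline(tok)    --> the rest of the current line, including the newline
--    lexer.lineno(tok)     --> the current line number (and position within the source)
--    lexer.getrest(tok)    --> 'second = 2'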

--- get the Lua keywords as a set-like table.
-- So `res["and"]` etc would be `true`.
-- @return a table
function lexer.get_keywords ()
    if not lua_keyword then
        lua_keyword = {
            ["and"] = true, ["break"] = true, ["do"] = true,
            ["else"] = true, ["elseif"] = true, ["end"] = true,
            ["false"] = true, ["for"] = true, ["function"] = true,
            ["if"] = true, ["in"] = true, ["local"] = true, ["nil"] = true,
            ["not"] = true, ["or"] = true, ["repeat"] = true,
            ["return"] = true, ["then"] = true, ["true"] = true,
            ["until"] = true, ["while"] = true
        }
    end
    return lua_keyword
end
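
-- For example (sketch):
--
--    local kw = lexer.get_keywords()
--    print(kw["while"])   --> true
--    print(kw["print"])   --> nil (a plain identifier, not a keyword)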

--- create a Lua token iterator from a string or file-like object.
-- Will return the token type and value.
-- @string s the string
-- @tab[opt] filter a table of token types to exclude, by default `{space=true,comments=true}`
-- @tab[opt] options a table of options; by default, `{number=true,string=true}`,
-- which means convert numbers and strip string quotes.
function lexer.lua(s,filter,options)
    filter = filter or {space=true,comments=true}
    lexer.get_keywords()
    if not lua_matches then
        lua_matches = {
            {WSPACE,wsdump},
            {NUMBER3,ndump},
            {IDEN,lua_vdump},
            {NUMBER4,ndump},
            {NUMBER5,ndump},
            {NUMBER6,ndump},
            {NUMBER7,ndump},
            {STRING1,sdump},
            {STRING2,sdump},
            {STRING3,sdump},
            {'^%-%-%[(=*)%[.-%]%1%]',cdump},
            {'^%-%-[^\n]*',cdump},
            {'^%[(=*)%[.-%]%1%]',sdump_l},
            {'^==',tdump},
            {'^~=',tdump},
            {'^<=',tdump},
            {'^>=',tdump},
            {'^%.%.%.',tdump},
            {'^%.%.',tdump},
            {'^.',tdump}
        }
    end
    return lexer.scan(s,lua_matches,filter,options)
end
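
-- Sketch of the love2d-specific behaviour of this modified lexer: an identifier
-- of the form `love.<module>.<name>` is split into an 'idenlovepre' token for
-- the prefix and an 'idenlove' token for the final name (illustrative output):
--
--    for t,v in lexer.lua('love.graphics.draw(img, 0, 0)') do print(t,v) end
--    idenlovepre  love.graphics.
--    idenlove     draw
--    (            (
--    iden         img
--    ,            ,
--    number       0
--    ...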

--- create a C/C++ token iterator from a string or file-like object.
-- Will return the token type and value.
-- @string s the string
-- @tab[opt] filter a table of token types to exclude, by default `{space=true,comments=true}`
-- @tab[opt] options a table of options; by default, `{number=true,string=true}`,
-- which means convert numbers and strip string quotes.
function lexer.cpp(s,filter,options)
    filter = filter or {space=true,comments=true}
    if not cpp_keyword then
        cpp_keyword = {
            ["class"] = true, ["break"] = true, ["do"] = true, ["sizeof"] = true,
            ["else"] = true, ["continue"] = true, ["struct"] = true,
            ["false"] = true, ["for"] = true, ["public"] = true, ["void"] = true,
            ["private"] = true, ["protected"] = true, ["goto"] = true,
            ["if"] = true, ["static"] = true, ["const"] = true, ["typedef"] = true,
            ["enum"] = true, ["char"] = true, ["int"] = true, ["bool"] = true,
            ["long"] = true, ["float"] = true, ["true"] = true, ["delete"] = true,
            ["double"] = true, ["while"] = true, ["new"] = true,
            ["namespace"] = true, ["try"] = true, ["catch"] = true,
            ["switch"] = true, ["case"] = true, ["extern"] = true,
            ["return"] = true, ["default"] = true, ['unsigned'] = true, ['signed'] = true,
            ["union"] = true, ["volatile"] = true, ["register"] = true, ["short"] = true,
        }
    end
    if not cpp_matches then
        cpp_matches = {
            {WSPACE,wsdump},
            {PREPRO,pdump},
            {NUMBER3,ndump},
            {IDEN,cpp_vdump},
            {NUMBER4,ndump},
            {NUMBER5,ndump},
            {NUMBER6,ndump},
            {NUMBER7,ndump},
            {CHAR1,chdump},
            {CHAR2,chdump},
            {CHAR3,chdump},
            {STRING1,sdump},
            {STRING2,sdump},
            {STRING3,sdump},
            {'^//.-\n',cdump},
            {'^/%*.-%*/',cdump},
            {'^==',tdump},
            {'^!=',tdump},
            {'^<=',tdump},
            {'^>=',tdump},
            {'^->',tdump},
            {'^&&',tdump},
            {'^||',tdump},
            {'^%+%+',tdump},
            {'^%-%-',tdump},
            {'^%+=',tdump},
            {'^%-=',tdump},
            {'^%*=',tdump},
            {'^/=',tdump},
            {'^|=',tdump},
            {'^%^=',tdump},
            {'^::',tdump},
            {'^.',tdump}
        }
    end
    return lexer.scan(s,cpp_matches,filter,options)
end
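
-- A hedged sketch of the C/C++ lexer (illustrative output):
--
--    for t,v in lexer.cpp('int x = 42;') do print(t,v) end
--    keyword  int
--    iden     x
--    =        =
--    number   42
--    ;        ;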

--- get a list of parameters separated by a delimiter from a stream.
-- @param tok the token stream
-- @string[opt=')'] endtoken end of list. Can be '\n'
-- @string[opt=','] delim separator
-- @return a list of token lists.
function lexer.get_separated_list(tok,endtoken,delim)
    endtoken = endtoken or ')'
    delim = delim or ','
    local parm_values = {}
    local level = 1 -- used to count ( and )
    local tl = {}
    local function tappend (tl,t,val)
        val = val or t
        append(tl,{t,val})
    end
    local is_end
    if endtoken == '\n' then
        is_end = function(t,val)
            return t == 'space' and val:find '\n'
        end
    else
        is_end = function (t)
            return t == endtoken
        end
    end
    local token,value
    while true do
        token,value = tok()
        if not token then return nil,'EOS' end -- end of stream is an error!
        if is_end(token,value) and level == 1 then
            append(parm_values,tl)
            break
        elseif token == '(' then
            level = level + 1
            tappend(tl,'(')
        elseif token == ')' then
            level = level - 1
            if level == 0 then -- finished with parm list
                append(parm_values,tl)
                break
            else
                tappend(tl,')')
            end
        elseif token == delim and level == 1 then
            append(parm_values,tl) -- a new parm
            tl = {}
        else
            tappend(tl,token,value)
        end
    end
    return parm_values,{token,value}
end
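
-- A hedged sketch: collecting the comma-separated arguments of a call, after
-- the opening '(' has already been consumed (values are illustrative):
--
--    local tok = lexer.lua('(a, b, c)')
--    tok()                                        -- consume the '('
--    local args = lexer.get_separated_list(tok)   -- { {{'iden','a'}}, {{'iden','b'}}, {{'iden','c'}} }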

--- get the next non-space token from the stream.
-- @param tok the token stream.
function lexer.skipws (tok)
    local t,v = tok()
    while t == 'space' do
        t,v = tok()
    end
    return t,v
end

local skipws = lexer.skipws

--- get the next token, which must be of the expected type.
-- Throws an error if this type does not match!
-- @param tok the token stream
-- @string expected_type the token type
-- @bool no_skip_ws whether we should skip whitespace
function lexer.expecting (tok,expected_type,no_skip_ws)
    assert_arg(1,tok,'function')
    assert_arg(2,expected_type,'string')
    local t,v
    if no_skip_ws then
        t,v = tok()
    else
        t,v = skipws(tok)
    end
    if t ~= expected_type then error ("expecting "..expected_type,2) end
    return v
end
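
-- A hedged sketch of `skipws` and `expecting`, using an empty filter so that
-- space tokens are kept in the stream (illustrative):
--
--    local tok = lexer.lua('x  =  1', {})
--    print(lexer.skipws(tok))               --> iden    x
--    print(lexer.expecting(tok, '='))       --> =
--    print(lexer.expecting(tok, 'number'))  --> 1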

return lexer