123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312 |
- --[[ Copyright (c) 2009 Peter "Corsix" Cawley
- Permission is hereby granted, free of charge, to any person obtaining a copy of
- this software and associated documentation files (the "Software"), to deal in
- the Software without restriction, including without limitation the rights to
- use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
- of the Software, and to permit persons to whom the Software is furnished to do
- so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE. --]]
- -- this C parser was taken from Corsix-TH, I'm sure this could be done much
- -- better (i.e.: I think everything I do could be substitutions made with LPeg
- -- during parsing), but I've just learned enough basic LPeg to make this
- -- work.
- -- see: http://lua-users.org/wiki/LpegRecipes
- local lpeg = require 'lpeg'
- local C, P, R, S, V = lpeg.C, lpeg.P, lpeg.R, lpeg.S, lpeg.V
- local Carg, Cc, Cp, Ct = lpeg.Carg, lpeg.Cc, lpeg.Cp, lpeg.Ct
- local tokens = P {
- 'tokens',
- -- Comment of form /* ... */
- comment = Ct(P '/*' * C((V 'newline' + (1 - P '*/')) ^ 0) * P '*/' * Cc 'comment'),
- -- Single line comment
- line_comment = Ct(P '//' * C((1 - V 'newline') ^ 0) * Cc 'comment_line'),
- -- Single platform independent line break which increments line number
- newline = (P '\r\n' + P '\n\r' + S '\r\n') * (Cp() * Carg(1)) / function(pos, state)
- state.line = state.line + 1
- state.line_start = pos
- end,
- -- Line continuation
- line_extend = Ct(C(P [[\]] * V 'newline') * Cc 'line_extend'),
- -- Whitespace of any length (includes newlines)
- whitespace = Ct(C((S ' \t' + V 'newline') ^ 1) * Cc 'whitespace'),
- -- Special form of #include with filename followed in angled brackets (matches 3 tokens)
- include = Ct(C(P '#include') * Cc 'preprocessor') * Ct(C(S ' \t' ^ 1) * Cc 'whitespace') * Ct(
- C(P '<' * (1 - P '>') ^ 1 * P '>') * Cc 'string'
- ),
- -- Preprocessor instruction
- preprocessor = V 'include'
- + Ct(
- C(
- P '#'
- * P ' ' ^ 0
- * (P 'define' + P 'elif' + P 'else' + P 'endif' + P '#' + P 'error' + P 'ifdef' + P 'ifndef' + P 'if' + P 'import' + P 'include' + P 'line' + P 'pragma' + P 'undef' + P 'using' + P 'pragma')
- * #S ' \r\n\t'
- ) * Cc 'preprocessor'
- ),
- -- Identifier of form [a-zA-Z_][a-zA-Z0-9_]*
- identifier = Ct(C(R('az', 'AZ', '__') * R('09', 'az', 'AZ', '__') ^ 0) * Cc 'identifier'),
- -- Single character in a string
- sstring_char = R('\001&', '([', ']\255') + (P '\\' * S [[ntvbrfa\?'"0x]]),
- dstring_char = R('\001!', '#[', ']\255') + (P '\\' * S [[ntvbrfa\?'"0x]]),
- -- String literal
- string = Ct(
- C(
- P "'" * (V 'sstring_char' + P '"') ^ 0 * P "'"
- + P '"' * (V 'dstring_char' + P "'") ^ 0 * P '"'
- ) * Cc 'string'
- ),
- -- Operator
- operator = Ct(
- C(
- P '>>='
- + P '<<='
- + P '...'
- + P '::'
- + P '<<'
- + P '>>'
- + P '<='
- + P '>='
- + P '=='
- + P '!='
- + P '||'
- + P '&&'
- + P '++'
- + P '--'
- + P '->'
- + P '+='
- + P '-='
- + P '*='
- + P '/='
- + P '|='
- + P '&='
- + P '^='
- + S '+-*/=<>%^|&.?:!~,'
- ) * Cc 'operator'
- ),
- -- Misc. char (token type is the character itself)
- char = Ct(C(S '[]{}();') / function(x)
- return x, x
- end),
- -- Hex, octal or decimal number
- int = Ct(
- C((P '0x' * R('09', 'af', 'AF') ^ 1) + (P '0' * R '07' ^ 0) + R '09' ^ 1) * Cc 'integer'
- ),
- -- Floating point number
- f_exponent = S 'eE' + S '+-' ^ -1 * R '09' ^ 1,
- f_terminator = S 'fFlL',
- float = Ct(
- C(
- R '09' ^ 1 * V 'f_exponent' * V 'f_terminator' ^ -1
- + R '09' ^ 0 * P '.' * R '09' ^ 1 * V 'f_exponent' ^ -1 * V 'f_terminator' ^ -1
- + R '09' ^ 1 * P '.' * R '09' ^ 0 * V 'f_exponent' ^ -1 * V 'f_terminator' ^ -1
- ) * Cc 'float'
- ),
- -- Any token
- token = V 'comment'
- + V 'line_comment'
- + V 'identifier'
- + V 'whitespace'
- + V 'line_extend'
- + V 'preprocessor'
- + V 'string'
- + V 'char'
- + V 'operator'
- + V 'float'
- + V 'int',
- -- Error for when nothing else matches
- error = (Cp() * C(P(1) ^ -8) * Carg(1)) / function(pos, where, state)
- error(
- ("Tokenising error on line %i, position %i, near '%s'"):format(
- state.line,
- pos - state.line_start + 1,
- where
- )
- )
- end,
- -- Match end of input or throw error
- finish = -P(1) + V 'error',
- -- Match stream of tokens into a table
- tokens = Ct(V 'token' ^ 0) * V 'finish',
- }
- local function TokeniseC(str)
- return tokens:match(str, 1, { line = 1, line_start = 1 })
- end
- local function set(t)
- local s = {}
- for _, v in ipairs(t) do
- s[v] = true
- end
- return s
- end
- local C_keywords = set { -- luacheck: ignore
- 'break',
- 'case',
- 'char',
- 'const',
- 'continue',
- 'default',
- 'do',
- 'double',
- 'else',
- 'enum',
- 'extern',
- 'float',
- 'for',
- 'goto',
- 'if',
- 'int',
- 'long',
- 'register',
- 'return',
- 'short',
- 'signed',
- 'sizeof',
- 'static',
- 'struct',
- 'switch',
- 'typedef',
- 'union',
- 'unsigned',
- 'void',
- 'volatile',
- 'while',
- }
- -- Very primitive C formatter that tries to put "things" inside braces on one
- -- line. This is a step done after preprocessing the C source to ensure that
- -- the duplicate line detector can more reliably pick out identical declarations.
- --
- -- an example:
- -- struct mystruct
- -- {
- -- int a;
- -- int b;
- -- };
- --
- -- would become:
- -- struct mystruct { int a; int b; };
- --
- -- The first one will have a lot of false positives (the line '{' for
- -- example), the second one is more unique.
- --- @param string
- --- @return string
- local function formatc(str)
- local toks = TokeniseC(str)
- local result = {}
- local block_level = 0
- local allow_one_nl = false
- local end_at_brace = false
- for _, token in ipairs(toks) do
- local typ = token[2]
- if typ == '{' then
- block_level = block_level + 1
- elseif typ == '}' then
- block_level = block_level - 1
- if block_level == 0 and end_at_brace then
- -- if we're not inside a block, we're at the basic statement level,
- -- and ';' indicates we're at the end of a statement, so we put end
- -- it with a newline.
- token[1] = token[1] .. '\n'
- end_at_brace = false
- end
- elseif typ == 'identifier' then
- -- static and/or inline usually indicate an inline header function,
- -- which has no trailing ';', so we have to add a newline after the
- -- '}' ourselves.
- local tok = token[1]
- if tok == 'static' or tok == 'inline' or tok == '__inline' then
- end_at_brace = true
- end
- elseif typ == 'preprocessor' then
- -- preprocessor directives don't end in ';' but need their newline, so
- -- we're going to allow the next newline to pass.
- allow_one_nl = true
- elseif typ == ';' then
- if block_level == 0 then
- -- if we're not inside a block, we're at the basic statement level,
- -- and ';' indicates we're at the end of a statement, so we put end
- -- it with a newline.
- token[1] = ';\n'
- end_at_brace = false
- end
- elseif typ == 'whitespace' then
- -- replace all whitespace by one space
- local repl = ' '
- -- except when allow_on_nl is true and there's a newline in the whitespace
- if string.find(token[1], '[\r\n]+') and allow_one_nl == true then
- -- in that case we replace all whitespace by one newline
- repl = '\n'
- allow_one_nl = false
- end
- token[1] = string.gsub(token[1], '%s+', repl)
- end
- result[#result + 1] = token[1]
- end
- return table.concat(result)
- end
- -- standalone operation (very handy for debugging)
- local function standalone(...) -- luacheck: ignore
- local Preprocess = require('preprocess')
- Preprocess.add_to_include_path('./../../src')
- Preprocess.add_to_include_path('./../../build/include')
- Preprocess.add_to_include_path('./../../.deps/usr/include')
- local raw = Preprocess.preprocess('', arg[1])
- local formatted
- if #arg == 2 and arg[2] == 'no' then
- formatted = raw
- else
- formatted = formatc(raw)
- end
- print(formatted)
- end
- -- uncomment this line (and comment the `return`) for standalone debugging
- -- example usage:
- -- ../../.deps/usr/bin/luajit formatc.lua ../../include/fileio.h.generated.h
- -- ../../.deps/usr/bin/luajit formatc.lua /usr/include/malloc.h
- -- standalone(...)
- return formatc
|