123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240 |
- --[[ Copyright (c) 2009 Peter "Corsix" Cawley
- Permission is hereby granted, free of charge, to any person obtaining a copy of
- this software and associated documentation files (the "Software"), to deal in
- the Software without restriction, including without limitation the rights to
- use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
- of the Software, and to permit persons to whom the Software is furnished to do
- so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE. --]]
- -- this C parser was taken from Corsix-TH, I'm sure this could be done much
- -- better (i.e.: I think everything I do could be substitutions made with LPeg
- -- during parsing), but I've just learned enough basic LPeg to make this
- -- work.
- -- see: http://lua-users.org/wiki/LpegRecipes
- local lpeg = require "lpeg"
- local C, P, R, S, V = lpeg.C, lpeg.P, lpeg.R, lpeg.S, lpeg.V
- local Carg, Cc, Cp, Ct = lpeg.Carg, lpeg.Cc, lpeg.Cp, lpeg.Ct
-- LPeg grammar that splits C source text into a flat list of tokens.
--
-- A successful match yields one table whose elements are `{text, type}`
-- pairs. `type` is one of "comment", "comment_line", "line_extend",
-- "whitespace", "preprocessor", "string", "identifier", "operator", "integer",
-- "float", or, for the characters in `[]{}();`, the character itself.
-- Note that for both comment forms `text` is only the comment body; the
-- delimiters are matched but not captured.
--
-- The grammar must be matched with one extra argument (read via Carg(1)): a
-- state table with the numeric fields `line` and `line_start`. The `newline`
-- rule updates them and the `error` rule reads them to build its message.
local tokens = P { "tokens";
  -- Comment of form /* ... */
  comment = Ct(P"/*" * C((V"newline" + (1 - P"*/"))^0) * P"*/" * Cc"comment"),

  -- Single line comment
  line_comment = Ct(P"//" * C((1 - V"newline")^0) * Cc"comment_line"),

  -- Single platform independent line break which increments line number.
  -- The function returns nothing, so this rule adds no capture values.
  newline = (P"\r\n" + P"\n\r" + S"\r\n") * (Cp() * Carg(1)) / function(pos, state)
    state.line = state.line + 1
    state.line_start = pos
  end,

  -- Line continuation
  line_extend = Ct(C(P[[\]] * V"newline") * Cc"line_extend"),

  -- Whitespace of any length (includes newlines)
  whitespace = Ct(C((S" \t" + V"newline")^1) * Cc"whitespace"),

  -- Special form of #include with filename followed in angled brackets
  -- (matches 3 tokens: the directive, the whitespace and the <...> "string")
  include = Ct(C(P"#include") * Cc"preprocessor") *
            Ct(C(S" \t"^1) * Cc"whitespace") *
            Ct(C(P"<" * (1 - P">")^1 * P">") * Cc"string"),

  -- Preprocessor instruction. The trailing lookahead requires a whitespace
  -- character after the directive name without consuming it. Longer names
  -- are listed before their prefixes ("ifdef"/"ifndef" before "if") because
  -- the choice is ordered.
  preprocessor = V"include" +
                 Ct(C(P"#" * P" "^0 * ( P"define" + P"elif" + P"else" + P"endif" + P"#" +
                                        P"error" + P"ifdef" + P"ifndef" + P"if" + P"import" +
                                        P"include" + P"line" + P"pragma" + P"undef" + P"using"
                                      ) * #S" \r\n\t") * Cc"preprocessor"),

  -- Identifier of form [a-zA-Z_][a-zA-Z0-9_]*
  identifier = Ct(C(R("az","AZ","__") * R("09","az","AZ","__")^0) * Cc"identifier"),

  -- Single character in a string: any byte except the closing quote and the
  -- backslash, or a backslash followed by one of the listed escape characters
  sstring_char = R("\001&","([","]\255") + (P"\\" * S[[ntvbrfa\?'"0x]]),
  dstring_char = R("\001!","#[","]\255") + (P"\\" * S[[ntvbrfa\?'"0x]]),

  -- String literal (single or double quoted)
  string = Ct(C(P"'" * (V"sstring_char" + P'"')^0 * P"'" +
                P'"' * (V"dstring_char" + P"'")^0 * P'"') * Cc"string"),

  -- Operator (three-character forms first, then two, then single characters)
  operator = Ct(C(P">>=" + P"<<=" + P"..." +
                  P"::" + P"<<" + P">>" + P"<=" + P">=" + P"==" + P"!=" +
                  P"||" + P"&&" + P"++" + P"--" + P"->" + P"+=" + P"-=" +
                  P"*=" + P"/=" + P"|=" + P"&=" + P"^=" + S"+-*/=<>%^|&.?:!~,") * Cc"operator"),

  -- Misc. char (token type is the character itself)
  char = Ct(C(S"[]{}();") / function(x) return x, x end),

  -- Hex, octal or decimal number. Both the "0x" and "0X" prefixes are
  -- accepted for hexadecimal literals.
  int = Ct(C((P"0" * S"xX" * R("09","af","AF")^1) + (P"0" * R"07"^0) + R"09"^1) * Cc"integer"),

  -- Floating point number.
  -- The exponent is a sequence: marker, optional sign, one or more digits.
  -- It has to be written with `*` throughout; `S"eE" + ...` would turn it
  -- into a choice between a bare marker and a signed digit run, because `*`
  -- binds tighter than `+`.
  f_exponent = S"eE" * S"+-"^-1 * R"09"^1,
  f_terminator = S"fFlL",
  float = Ct(C(
    R"09"^1 * V"f_exponent" * V"f_terminator"^-1 +
    R"09"^0 * P"." * R"09"^1 * V"f_exponent"^-1 * V"f_terminator"^-1 +
    R"09"^1 * P"." * R"09"^0 * V"f_exponent"^-1 * V"f_terminator"^-1
  ) * Cc"float"),

  -- Any token. Order matters: "float" is tried before "int" so that the
  -- integer part of a floating point literal is not taken as an integer.
  token = V"comment" +
          V"line_comment" +
          V"identifier" +
          V"whitespace" +
          V"line_extend" +
          V"preprocessor" +
          V"string" +
          V"char" +
          V"operator" +
          V"float" +
          V"int",

  -- Error for when nothing else matches: raises a Lua error quoting up to 8
  -- characters of the offending input together with line and column
  error = (Cp() * C(P(1) ^ -8) * Carg(1)) / function(pos, where, state)
    error(("Tokenising error on line %i, position %i, near '%s'")
      :format(state.line, pos - state.line_start + 1, where))
  end,

  -- Match end of input or throw error
  finish = -P(1) + V"error",

  -- Match stream of tokens into a table
  tokens = Ct(V"token" ^ 0) * V"finish",
}
--- Tokenise a piece of C source text.
-- Runs the `tokens` grammar over `str` starting at position 1. A fresh state
-- table (current line number and offset of the current line start, both 1)
-- is handed to the grammar as its extra match argument.
-- @param str the C source text
-- @return the result of `tokens:match`
local function TokeniseC(str)
  local state = {line = 1, line_start = 1}
  return tokens:match(str, 1, state)
end
--- Turn a list into a membership table.
-- @param t sequence of values
-- @return a new table in which every value listed in `t` is a key mapped to
--         `true`
local function set(t)
  local members = {}
  for _, key in ipairs(t) do
    members[key] = true
  end
  return members
end
-- Lookup table (keyword -> true) of the C89 keywords. The file-local `set`
-- helper turns the list below into that table.
local C_keywords = set { -- luacheck: ignore
  "auto", "break", "case", "char", "const", "continue", "default", "do",
  "double", "else", "enum", "extern", "float", "for", "goto", "if", "int",
  "long", "register", "return", "short", "signed", "sizeof", "static",
  "struct", "switch", "typedef", "union", "unsigned", "void", "volatile",
  "while",
}
-- Very primitive C formatter that tries to put "things" inside braces on one
-- line. This is a step done after preprocessing the C source to ensure that
-- the duplicate line detector can more reliably pick out identical
-- declarations.
--
-- an example:
--   struct mystruct
--   {
--     int a;
--     int b;
--   };
--
-- would become:
--   struct mystruct { int a; int b; };
--
-- The first one will have a lot of false positives (the line '{' for
-- example), the second one is more unique.
--
-- @param str C source text
-- @return the reformatted source as a single string
-- Tokenising errors raised by TokeniseC are not caught here.
local function formatc(str)
  local toks = TokeniseC(str)
  local result = {}
  local block_level = 0      -- current '{' nesting depth
  local allow_one_nl = false -- let the next newline through (after a directive)
  local end_at_brace = false -- break the line after the next top-level '}'

  for _, token in ipairs(toks) do
    local text, typ = token[1], token[2]

    if typ == '{' then
      block_level = block_level + 1
    elseif typ == '}' then
      -- an unmatched '}' is passed through untouched; letting the depth go
      -- negative would stop every later top-level ';' from being recognised
      if block_level > 0 then
        block_level = block_level - 1
        if block_level == 0 and end_at_brace then
          -- this brace closes the body of a static/inline function, which
          -- has no trailing ';', so the statement is ended here with a
          -- newline.
          text = text .. "\n"
          end_at_brace = false
        end
      end
    elseif typ == 'identifier' then
      -- static and/or inline usually indicate an inline header function,
      -- which has no trailing ';', so we have to add a newline after the
      -- '}' ourselves.
      if text == 'static' or text == 'inline' or text == '__inline' then
        end_at_brace = true
      end
    elseif typ == 'preprocessor' then
      -- preprocessor directives don't end in ';' but need their newline, so
      -- we're going to allow the next newline to pass.
      allow_one_nl = true
    elseif typ == ';' then
      if block_level == 0 then
        -- if we're not inside a block, we're at the basic statement level,
        -- and ';' indicates we're at the end of a statement, so we end it
        -- with a newline.
        text = ";\n"
        -- a static/inline declaration that ends in ';' (a variable or a
        -- prototype) has no body, so the pending request must not leak onto
        -- the closing brace of a later, unrelated declaration.
        -- (Old-style K&R definitions, which put ';' between the parameter
        -- list and the body, are not catered for.)
        end_at_brace = false
      end
    elseif typ == 'comment' or typ == 'comment_line' then
      -- the tokeniser captures only the comment body, not its delimiters, so
      -- emitting the token text would inject the comment into the code;
      -- replace the whole comment by a single space instead.
      text = " "
    elseif typ == 'whitespace' then
      -- replace all whitespace by one space, except when allow_one_nl is set
      -- and there's a newline in the whitespace: in that case we replace all
      -- whitespace by one newline
      if allow_one_nl and string.find(text, "[\r\n]") then
        text = "\n"
        allow_one_nl = false
      else
        text = " "
      end
    end

    result[#result + 1] = text
  end

  return table.concat(result)
end
-- standalone operation (very handy for debugging)
--
-- Preprocesses the file named by the first argument and prints the result,
-- run through formatc unless there are exactly two arguments and the second
-- one is 'no'.
--
-- @param ... the command line arguments (file name, optional 'no'); when
--            none are given, the global `arg` table is used instead
local function standalone(...) -- luacheck: ignore
  local Preprocess = require("preprocess")
  Preprocess.add_to_include_path('./../../src')
  Preprocess.add_to_include_path('./../../build/include')
  Preprocess.add_to_include_path('./../../.deps/usr/include')

  -- Prefer the arguments actually passed in. The global table is read
  -- through _G so that a function-local `arg` (as provided to vararg
  -- functions by Lua 5.1's compatibility mode) cannot shadow it.
  local args
  if select('#', ...) > 0 then
    args = {...}
  else
    args = _G.arg or {}
  end

  local raw = Preprocess.preprocess('', args[1])

  local formatted
  if #args == 2 and args[2] == 'no' then
    formatted = raw
  else
    formatted = formatc(raw)
  end

  print(formatted)
end
- -- uncomment this line (and comment the `return`) for standalone debugging
- -- example usage:
- -- ../../.deps/usr/bin/luajit formatc.lua ../../include/fileio.h.generated.h
- -- ../../.deps/usr/bin/luajit formatc.lua /usr/include/malloc.h
- -- standalone(...)
- return formatc
|