/*
   AngelCode Scripting Library
   Copyright (c) 2003-2015 Andreas Jonsson

   This software is provided 'as-is', without any express or implied
   warranty. In no event will the authors be held liable for any
   damages arising from the use of this software.

   Permission is granted to anyone to use this software for any
   purpose, including commercial applications, and to alter it and
   redistribute it freely, subject to the following restrictions:

   1. The origin of this software must not be misrepresented; you
      must not claim that you wrote the original software. If you use
      this software in a product, an acknowledgment in the product
      documentation would be appreciated but is not required.

   2. Altered source versions must be plainly marked as such, and
      must not be misrepresented as being the original software.

   3. This notice may not be removed or altered from any source
      distribution.

   The original version of this library can be located at:
   http://www.angelcode.com/angelscript/

   Andreas Jonsson
   andreas@angelcode.com
*/

//
// as_tokenizer.cpp
//
// This class identifies tokens from the script code
//
#include "as_config.h"
#include "as_scriptengine.h"
#include "as_tokenizer.h"
#include "as_tokendef.h"

#if !defined(AS_NO_MEMORY_H)
#include <memory.h>
#endif

#include <string.h> // strcmp()

BEGIN_AS_NAMESPACE
  37. asCTokenizer::asCTokenizer()
  38. {
  39. engine = 0;
  40. memset(keywordTable, 0, sizeof(keywordTable));
  41. // Initialize the jump table
  42. for( asUINT n = 0; n < numTokenWords; n++ )
  43. {
  44. const sTokenWord& current = tokenWords[n];
  45. unsigned char start = current.word[0];
  46. // Create new jump table entry if none exists
  47. if( !keywordTable[start] )
  48. {
  49. // Surely there won't ever be more than 32 keywords starting with
  50. // the same character. Right?
  51. keywordTable[start] = asNEWARRAY(const sTokenWord*, 32);
  52. memset(keywordTable[start], 0, sizeof(sTokenWord*)*32);
  53. }
  54. // Add the token sorted from longest to shortest so
  55. // we check keywords greedily.
  56. const sTokenWord** tok = keywordTable[start];
  57. unsigned insert = 0, index = 0;
  58. while( tok[index] )
  59. {
  60. if(tok[index]->wordLength >= current.wordLength)
  61. ++insert;
  62. ++index;
  63. }
  64. while( index > insert )
  65. {
  66. tok[index] = tok[index - 1];
  67. --index;
  68. }
  69. tok[insert] = &current;
  70. }
  71. }
  72. asCTokenizer::~asCTokenizer()
  73. {
  74. // Deallocate the jump table
  75. for( asUINT n = 0; n < 256; n++ )
  76. {
  77. if( keywordTable[n] )
  78. asDELETEARRAY(keywordTable[n]);
  79. }
  80. }
  81. // static
  82. const char *asCTokenizer::GetDefinition(int tokenType)
  83. {
  84. if( tokenType == ttUnrecognizedToken ) return "<unrecognized token>";
  85. if( tokenType == ttEnd ) return "<end of file>";
  86. if( tokenType == ttWhiteSpace ) return "<white space>";
  87. if( tokenType == ttOnelineComment ) return "<one line comment>";
  88. if( tokenType == ttMultilineComment ) return "<multiple lines comment>";
  89. if( tokenType == ttIdentifier ) return "<identifier>";
  90. if( tokenType == ttIntConstant ) return "<integer constant>";
  91. if( tokenType == ttFloatConstant ) return "<float constant>";
  92. if( tokenType == ttDoubleConstant ) return "<double constant>";
  93. if( tokenType == ttStringConstant ) return "<string constant>";
  94. if( tokenType == ttMultilineStringConstant ) return "<multiline string constant>";
  95. if( tokenType == ttNonTerminatedStringConstant ) return "<nonterminated string constant>";
  96. if( tokenType == ttBitsConstant ) return "<bits constant>";
  97. if( tokenType == ttHeredocStringConstant ) return "<heredoc string constant>";
  98. for( asUINT n = 0; n < numTokenWords; n++ )
  99. if( tokenWords[n].tokenType == tokenType )
  100. return tokenWords[n].word;
  101. return 0;
  102. }
  103. bool asCTokenizer::IsDigitInRadix(char ch, int radix) const
  104. {
  105. if( ch >= '0' && ch <= '9' ) return (ch -= '0') < radix;
  106. if( ch >= 'A' && ch <= 'Z' ) return (ch -= 'A'-10) < radix;
  107. if( ch >= 'a' && ch <= 'z' ) return (ch -= 'a'-10) < radix;
  108. return false;
  109. }
  110. eTokenType asCTokenizer::GetToken(const char *source, size_t sourceLength, size_t *tokenLength, asETokenClass *tc) const
  111. {
  112. asASSERT(source != 0);
  113. asASSERT(tokenLength != 0);
  114. eTokenType tokenType;
  115. size_t tlen;
  116. asETokenClass t = ParseToken(source, sourceLength, tlen, tokenType);
  117. if( tc ) *tc = t;
  118. if( tokenLength ) *tokenLength = tlen;
  119. return tokenType;
  120. }
// Classifies the token at the start of source, setting tokenLength and
// tokenType and returning the token class. The checks run in strict
// priority order: white space, comments, literal constants, identifiers,
// then keywords. Note that IsIdentifier itself rejects reserved words,
// so words that match a keyword fall through to the IsKeyWord check.
asETokenClass asCTokenizer::ParseToken(const char *source, size_t sourceLength, size_t &tokenLength, eTokenType &tokenType) const
{
	if( IsWhiteSpace(source, sourceLength, tokenLength, tokenType) ) return asTC_WHITESPACE;
	if( IsComment(source, sourceLength, tokenLength, tokenType) )    return asTC_COMMENT;
	if( IsConstant(source, sourceLength, tokenLength, tokenType) )   return asTC_VALUE;
	if( IsIdentifier(source, sourceLength, tokenLength, tokenType) ) return asTC_IDENTIFIER;
	if( IsKeyWord(source, sourceLength, tokenLength, tokenType) )    return asTC_KEYWORD;

	// If none of the above this is an unrecognized token
	// We can find the length of the token by advancing
	// one step and trying to identify a token there
	tokenType = ttUnrecognizedToken;
	tokenLength = 1;
	return asTC_UNKNOWN;
}
  135. bool asCTokenizer::IsWhiteSpace(const char *source, size_t sourceLength, size_t &tokenLength, eTokenType &tokenType) const
  136. {
  137. // Treat UTF8 byte-order-mark (EF BB BF) as whitespace
  138. if( sourceLength >= 3 &&
  139. asBYTE(source[0]) == 0xEFu &&
  140. asBYTE(source[1]) == 0xBBu &&
  141. asBYTE(source[2]) == 0xBFu )
  142. {
  143. tokenType = ttWhiteSpace;
  144. tokenLength = 3;
  145. return true;
  146. }
  147. // Group all other white space characters into one
  148. size_t n;
  149. int numWsChars = (int)strlen(whiteSpace);
  150. for( n = 0; n < sourceLength; n++ )
  151. {
  152. bool isWhiteSpace = false;
  153. for( int w = 0; w < numWsChars; w++ )
  154. {
  155. if( source[n] == whiteSpace[w] )
  156. {
  157. isWhiteSpace = true;
  158. break;
  159. }
  160. }
  161. if( !isWhiteSpace ) break;
  162. }
  163. if( n > 0 )
  164. {
  165. tokenType = ttWhiteSpace;
  166. tokenLength = n;
  167. return true;
  168. }
  169. return false;
  170. }
  171. bool asCTokenizer::IsComment(const char *source, size_t sourceLength, size_t &tokenLength, eTokenType &tokenType) const
  172. {
  173. if( sourceLength < 2 )
  174. return false;
  175. if( source[0] != '/' )
  176. return false;
  177. if( source[1] == '/' )
  178. {
  179. // One-line comment
  180. // Find the length
  181. size_t n;
  182. for( n = 2; n < sourceLength; n++ )
  183. {
  184. if( source[n] == '\n' )
  185. break;
  186. }
  187. tokenType = ttOnelineComment;
  188. tokenLength = n < sourceLength ? n+1 : n;
  189. return true;
  190. }
  191. if( source[1] == '*' )
  192. {
  193. // Multi-line comment
  194. // Find the length
  195. size_t n;
  196. for( n = 2; n < sourceLength-1; )
  197. {
  198. if( source[n++] == '*' && source[n] == '/' )
  199. break;
  200. }
  201. tokenType = ttMultilineComment;
  202. tokenLength = n+1;
  203. return true;
  204. }
  205. return false;
  206. }
// Recognizes a literal constant at the start of source: based integer
// (0b/0o/0d/0x prefix -> ttBitsConstant), decimal integer, float/double
// (fraction and/or exponent, optional trailing f/F), heredoc string
// ("""..."""), or quoted string ('...'/"..."). Sets tokenLength and
// tokenType and returns true on a match.
bool asCTokenizer::IsConstant(const char *source, size_t sourceLength, size_t &tokenLength, eTokenType &tokenType) const
{
	// Starting with number (a leading '.' counts only when followed by a digit)
	if( (source[0] >= '0' && source[0] <= '9') || (source[0] == '.' && sourceLength > 1 && source[1] >= '0' && source[1] <= '9') )
	{
		// Is it a based number?
		if( source[0] == '0' && sourceLength > 1 )
		{
			// Determine the radix for the constant
			int radix = 0;
			switch( source[1] )
			{
			case 'b': case 'B': radix = 2; break;
			case 'o': case 'O': radix = 8; break;
			case 'd': case 'D': radix = 10; break;
			case 'x': case 'X': radix = 16; break;
			}

			if( radix )
			{
				// Consume every digit valid in the chosen radix
				size_t n;
				for( n = 2; n < sourceLength; n++ )
					if( !IsDigitInRadix(source[n], radix) )
						break;

				tokenType = ttBitsConstant;
				tokenLength = n;
				return true;
			}
		}

		// Consume the integer part
		size_t n;
		for( n = 0; n < sourceLength; n++ )
		{
			if( source[n] < '0' || source[n] > '9' )
				break;
		}

		// A '.' or exponent marker makes this a floating point constant
		if( n < sourceLength && (source[n] == '.' || source[n] == 'e' || source[n] == 'E') )
		{
			if( source[n] == '.' )
			{
				// Consume the fractional digits
				n++;
				for( ; n < sourceLength; n++ )
				{
					if( source[n] < '0' || source[n] > '9' )
						break;
				}
			}

			if( n < sourceLength && (source[n] == 'e' || source[n] == 'E') )
			{
				// Consume the exponent, with optional sign
				n++;
				if( n < sourceLength && (source[n] == '-' || source[n] == '+') )
					n++;

				for( ; n < sourceLength; n++ )
				{
					if( source[n] < '0' || source[n] > '9' )
						break;
				}
			}

			// A trailing f/F suffix marks a float, otherwise it is a double
			// (or a float when the library is built with floats only)
			if( n < sourceLength && (source[n] == 'f' || source[n] == 'F') )
			{
				tokenType = ttFloatConstant;
				tokenLength = n + 1;
			}
			else
			{
#ifdef AS_USE_DOUBLE_AS_FLOAT
				tokenType = ttFloatConstant;
#else
				tokenType = ttDoubleConstant;
#endif
				tokenLength = n;
			}
			return true;
		}

		// No fraction or exponent: plain integer constant
		tokenType = ttIntConstant;
		tokenLength = n;
		return true;
	}

	// String constant between double or single quotes
	if( source[0] == '"' || source[0] == '\'' )
	{
		// Is it a normal string constant or a heredoc string constant?
		if( sourceLength >= 6 && source[0] == '"' && source[1] == '"' && source[2] == '"' )
		{
			// Heredoc string constant (spans multiple lines, no escape sequences)
			// Find the length
			size_t n;
			for( n = 3; n < sourceLength-2; n++ )
			{
				if( source[n] == '"' && source[n+1] == '"' && source[n+2] == '"' )
					break;
			}

			// NOTE(review): for an unterminated heredoc the loop ends with
			// n == sourceLength-2, so n+3 exceeds sourceLength by one —
			// presumably the caller tolerates this; verify against callers.
			tokenType = ttHeredocStringConstant;
			tokenLength = n+3;
		}
		else
		{
			// Normal string constant
			tokenType = ttStringConstant;
			char quote = source[0];
			// evenSlashes tracks whether the preceding backslashes pair up;
			// a quote only terminates the string when not escaped
			bool evenSlashes = true;
			size_t n;
			for( n = 1; n < sourceLength; n++ )
			{
#ifdef AS_DOUBLEBYTE_CHARSET
				// Double-byte characters are only allowed for ASCII
				if( (source[n] & 0x80) && engine->ep.scanner == 0 )
				{
					// This is a leading character in a double byte character,
					// include both in the string and continue processing.
					n++;
					continue;
				}
#endif
				// An embedded newline upgrades the token to a multiline string
				if( source[n] == '\n' )
					tokenType = ttMultilineStringConstant;

				if( source[n] == quote && evenSlashes )
				{
					tokenLength = n+1;
					return true;
				}

				if( source[n] == '\\' ) evenSlashes = !evenSlashes; else evenSlashes = true;
			}

			// The closing quote was never found
			tokenType = ttNonTerminatedStringConstant;
			tokenLength = n;
		}
		return true;
	}

	return false;
}
  335. bool asCTokenizer::IsIdentifier(const char *source, size_t sourceLength, size_t &tokenLength, eTokenType &tokenType) const
  336. {
  337. // char is unsigned by default on some architectures, e.g. ppc and arm
  338. // Make sure the value is always treated as signed in the below comparisons
  339. signed char c = source[0];
  340. // Starting with letter or underscore
  341. if( (c >= 'a' && c <= 'z') ||
  342. (c >= 'A' && c <= 'Z') ||
  343. c == '_' ||
  344. (c < 0 && engine->ep.allowUnicodeIdentifiers) )
  345. {
  346. tokenType = ttIdentifier;
  347. tokenLength = 1;
  348. for( size_t n = 1; n < sourceLength; n++ )
  349. {
  350. c = source[n];
  351. if( (c >= 'a' && c <= 'z') ||
  352. (c >= 'A' && c <= 'Z') ||
  353. (c >= '0' && c <= '9') ||
  354. c == '_' ||
  355. (c < 0 && engine->ep.allowUnicodeIdentifiers) )
  356. tokenLength++;
  357. else
  358. break;
  359. }
  360. // Make sure the identifier isn't a reserved keyword
  361. if( IsKeyWord(source, tokenLength, tokenLength, tokenType) )
  362. return false;
  363. return true;
  364. }
  365. return false;
  366. }
// Recognizes a reserved keyword or operator token at the start of source.
// Looks up the bucket for the first character in the jump table built by
// the constructor; the bucket is sorted longest-first, so the first match
// found is the greedy (longest) one.
bool asCTokenizer::IsKeyWord(const char *source, size_t sourceLength, size_t &tokenLength, eTokenType &tokenType) const
{
	unsigned char start = source[0];
	const sTokenWord **ptr = keywordTable[start];

	// No keyword starts with this character
	if( !ptr )
		return false;

	// The bucket is null-terminated; scan it in longest-first order
	for( ; *ptr; ++ptr )
	{
		size_t wlen = (*ptr)->wordLength;
		if( sourceLength >= wlen && strncmp(source, (*ptr)->word, wlen) == 0 )
		{
			// Tokens that end with a character that can be part of an
			// identifier require an extra verification to guarantee that
			// we don't split an identifier token, e.g. the "!is" token
			// and the tokens "!" and "isTrue" in the "!isTrue" expression.
			if( wlen < sourceLength &&
				((source[wlen-1] >= 'a' && source[wlen-1] <= 'z') ||
				 (source[wlen-1] >= 'A' && source[wlen-1] <= 'Z') ||
				 (source[wlen-1] >= '0' && source[wlen-1] <= '9')) &&
				((source[wlen] >= 'a' && source[wlen] <= 'z') ||
				 (source[wlen] >= 'A' && source[wlen] <= 'Z') ||
				 (source[wlen] >= '0' && source[wlen] <= '9') ||
				 (source[wlen] == '_')) )
			{
				// The token doesn't really match, even though
				// the start of the source matches the token
				continue;
			}

			tokenType = (*ptr)->tokenType;
			tokenLength = wlen;
			return true;
		}
	}

	return false;
}
END_AS_NAMESPACE