Lexer.h 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384
  1. /*
  2. * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
  3. * Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2011, 2012, 2013 Apple Inc. All rights reserved.
  4. * Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu)
  5. *
  6. * This library is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Library General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2 of the License, or (at your option) any later version.
  10. *
  11. * This library is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Library General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Library General Public License
  17. * along with this library; see the file COPYING.LIB. If not, write to
  18. * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  19. * Boston, MA 02110-1301, USA.
  20. *
  21. */
  22. #ifndef Lexer_h
  23. #define Lexer_h
  24. #include "Lookup.h"
  25. #include "ParserArena.h"
  26. #include "ParserTokens.h"
  27. #include "SourceCode.h"
  28. #include <wtf/ASCIICType.h>
  29. #include <wtf/SegmentedVector.h>
  30. #include <wtf/Vector.h>
  31. #include <wtf/unicode/Unicode.h>
  32. namespace JSC {
  33. class Keywords {
  34. public:
  35. bool isKeyword(const Identifier& ident) const
  36. {
  37. return m_keywordTable.entry(m_vm, ident);
  38. }
  39. const HashEntry* getKeyword(const Identifier& ident) const
  40. {
  41. return m_keywordTable.entry(m_vm, ident);
  42. }
  43. ~Keywords()
  44. {
  45. m_keywordTable.deleteTable();
  46. }
  47. private:
  48. friend class VM;
  49. Keywords(VM*);
  50. VM* m_vm;
  51. const HashTable m_keywordTable;
  52. };
  53. enum LexerFlags {
  54. LexerFlagsIgnoreReservedWords = 1,
  55. LexerFlagsDontBuildStrings = 2,
  56. LexexFlagsDontBuildKeywords = 4
  57. };
  58. template <typename T>
  59. class Lexer {
  60. WTF_MAKE_NONCOPYABLE(Lexer);
  61. WTF_MAKE_FAST_ALLOCATED;
  62. public:
  63. Lexer(VM*);
  64. ~Lexer();
  65. // Character manipulation functions.
  66. static bool isWhiteSpace(T character);
  67. static bool isLineTerminator(T character);
  68. static unsigned char convertHex(int c1, int c2);
  69. static UChar convertUnicode(int c1, int c2, int c3, int c4);
  70. // Functions to set up parsing.
  71. void setCode(const SourceCode&, ParserArena*);
  72. void setIsReparsing() { m_isReparsing = true; }
  73. bool isReparsing() const { return m_isReparsing; }
  74. JSTokenType lex(JSTokenData*, JSTokenLocation*, unsigned, bool strictMode);
  75. bool nextTokenIsColon();
  76. int lineNumber() const { return m_lineNumber; }
  77. ALWAYS_INLINE int currentOffset() const { return offsetFromSourcePtr(m_code); }
  78. ALWAYS_INLINE int currentLineStartOffset() const { return offsetFromSourcePtr(m_lineStart); }
  79. void setLastLineNumber(int lastLineNumber) { m_lastLineNumber = lastLineNumber; }
  80. int lastLineNumber() const { return m_lastLineNumber; }
  81. bool prevTerminator() const { return m_terminator; }
  82. SourceCode sourceCode(int openBrace, int closeBrace, int firstLine, unsigned startColumn);
  83. bool scanRegExp(const Identifier*& pattern, const Identifier*& flags, UChar patternPrefix = 0);
  84. bool skipRegExp();
  85. // Functions for use after parsing.
  86. bool sawError() const { return m_error; }
  87. String getErrorMessage() const { return m_lexErrorMessage; }
  88. void clear();
  89. void setOffset(int offset, int lineStartOffset)
  90. {
  91. m_error = 0;
  92. m_lexErrorMessage = String();
  93. m_code = sourcePtrFromOffset(offset);
  94. m_lineStart = sourcePtrFromOffset(lineStartOffset);
  95. ASSERT(currentOffset() >= currentLineStartOffset());
  96. m_buffer8.resize(0);
  97. m_buffer16.resize(0);
  98. if (LIKELY(m_code < m_codeEnd))
  99. m_current = *m_code;
  100. else
  101. m_current = 0;
  102. }
  103. void setLineNumber(int line)
  104. {
  105. m_lineNumber = line;
  106. }
  107. SourceProvider* sourceProvider() const { return m_source->provider(); }
  108. JSTokenType lexExpectIdentifier(JSTokenData*, JSTokenLocation*, unsigned, bool strictMode);
  109. private:
  110. void record8(int);
  111. void append8(const T*, size_t);
  112. void record16(int);
  113. void record16(T);
  114. void append16(const LChar*, size_t);
  115. void append16(const UChar* characters, size_t length) { m_buffer16.append(characters, length); }
  116. ALWAYS_INLINE void shift();
  117. ALWAYS_INLINE bool atEnd() const;
  118. ALWAYS_INLINE T peek(int offset) const;
  119. struct UnicodeHexValue {
  120. enum ValueType { ValidHex, IncompleteHex, InvalidHex };
  121. explicit UnicodeHexValue(int value)
  122. : m_value(value)
  123. {
  124. }
  125. explicit UnicodeHexValue(ValueType type)
  126. : m_value(type == IncompleteHex ? -2 : -1)
  127. {
  128. }
  129. ValueType valueType() const
  130. {
  131. if (m_value >= 0)
  132. return ValidHex;
  133. return m_value == -2 ? IncompleteHex : InvalidHex;
  134. }
  135. bool isValid() const { return m_value >= 0; }
  136. int value() const
  137. {
  138. ASSERT(m_value >= 0);
  139. return m_value;
  140. }
  141. private:
  142. int m_value;
  143. };
  144. UnicodeHexValue parseFourDigitUnicodeHex();
  145. void shiftLineTerminator();
  146. ALWAYS_INLINE int offsetFromSourcePtr(const T* ptr) const { return ptr - m_codeStart; }
  147. ALWAYS_INLINE const T* sourcePtrFromOffset(int offset) const { return m_codeStart + offset; }
  148. String invalidCharacterMessage() const;
  149. ALWAYS_INLINE const T* currentSourcePtr() const;
  150. ALWAYS_INLINE void setOffsetFromSourcePtr(const T* sourcePtr, unsigned lineStartOffset) { setOffset(offsetFromSourcePtr(sourcePtr), lineStartOffset); }
  151. ALWAYS_INLINE void setCodeStart(const StringImpl*);
  152. ALWAYS_INLINE const Identifier* makeIdentifier(const LChar* characters, size_t length);
  153. ALWAYS_INLINE const Identifier* makeIdentifier(const UChar* characters, size_t length);
  154. ALWAYS_INLINE const Identifier* makeLCharIdentifier(const LChar* characters, size_t length);
  155. ALWAYS_INLINE const Identifier* makeLCharIdentifier(const UChar* characters, size_t length);
  156. ALWAYS_INLINE const Identifier* makeRightSizedIdentifier(const UChar* characters, size_t length, UChar orAllChars);
  157. ALWAYS_INLINE const Identifier* makeIdentifierLCharFromUChar(const UChar* characters, size_t length);
  158. ALWAYS_INLINE bool lastTokenWasRestrKeyword() const;
  159. template <int shiftAmount> void internalShift();
  160. template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType parseKeyword(JSTokenData*);
  161. template <bool shouldBuildIdentifiers> ALWAYS_INLINE JSTokenType parseIdentifier(JSTokenData*, unsigned lexerFlags, bool strictMode);
  162. template <bool shouldBuildIdentifiers> NEVER_INLINE JSTokenType parseIdentifierSlowCase(JSTokenData*, unsigned lexerFlags, bool strictMode);
  163. enum StringParseResult {
  164. StringParsedSuccessfully,
  165. StringUnterminated,
  166. StringCannotBeParsed
  167. };
  168. template <bool shouldBuildStrings> ALWAYS_INLINE StringParseResult parseString(JSTokenData*, bool strictMode);
  169. template <bool shouldBuildStrings> NEVER_INLINE StringParseResult parseStringSlowCase(JSTokenData*, bool strictMode);
  170. ALWAYS_INLINE void parseHex(double& returnValue);
  171. ALWAYS_INLINE bool parseOctal(double& returnValue);
  172. ALWAYS_INLINE bool parseDecimal(double& returnValue);
  173. ALWAYS_INLINE void parseNumberAfterDecimalPoint();
  174. ALWAYS_INLINE bool parseNumberAfterExponentIndicator();
  175. ALWAYS_INLINE bool parseMultilineComment();
  176. static const size_t initialReadBufferCapacity = 32;
  177. int m_lineNumber;
  178. int m_lastLineNumber;
  179. Vector<LChar> m_buffer8;
  180. Vector<UChar> m_buffer16;
  181. bool m_terminator;
  182. int m_lastToken;
  183. const SourceCode* m_source;
  184. unsigned m_sourceOffset;
  185. const T* m_code;
  186. const T* m_codeStart;
  187. const T* m_codeEnd;
  188. const T* m_codeStartPlusOffset;
  189. const T* m_lineStart;
  190. bool m_isReparsing;
  191. bool m_atLineStart;
  192. bool m_error;
  193. String m_lexErrorMessage;
  194. T m_current;
  195. IdentifierArena* m_arena;
  196. VM* m_vm;
  197. };
  198. template <>
  199. ALWAYS_INLINE bool Lexer<LChar>::isWhiteSpace(LChar ch)
  200. {
  201. return ch == ' ' || ch == '\t' || ch == 0xB || ch == 0xC || ch == 0xA0;
  202. }
  203. template <>
  204. ALWAYS_INLINE bool Lexer<UChar>::isWhiteSpace(UChar ch)
  205. {
  206. return (ch < 256) ? Lexer<LChar>::isWhiteSpace(static_cast<LChar>(ch)) : (WTF::Unicode::isSeparatorSpace(ch) || ch == 0xFEFF);
  207. }
  208. template <>
  209. ALWAYS_INLINE bool Lexer<LChar>::isLineTerminator(LChar ch)
  210. {
  211. return ch == '\r' || ch == '\n';
  212. }
  213. template <>
  214. ALWAYS_INLINE bool Lexer<UChar>::isLineTerminator(UChar ch)
  215. {
  216. return ch == '\r' || ch == '\n' || (ch & ~1) == 0x2028;
  217. }
  218. template <typename T>
  219. inline unsigned char Lexer<T>::convertHex(int c1, int c2)
  220. {
  221. return (toASCIIHexValue(c1) << 4) | toASCIIHexValue(c2);
  222. }
  223. template <typename T>
  224. inline UChar Lexer<T>::convertUnicode(int c1, int c2, int c3, int c4)
  225. {
  226. return (convertHex(c1, c2) << 8) | convertHex(c3, c4);
  227. }
  228. template <typename T>
  229. ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const LChar* characters, size_t length)
  230. {
  231. return &m_arena->makeIdentifier(m_vm, characters, length);
  232. }
  233. template <typename T>
  234. ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const UChar* characters, size_t length)
  235. {
  236. return &m_arena->makeIdentifier(m_vm, characters, length);
  237. }
  238. template <>
  239. ALWAYS_INLINE const Identifier* Lexer<LChar>::makeRightSizedIdentifier(const UChar* characters, size_t length, UChar)
  240. {
  241. return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
  242. }
  243. template <>
  244. ALWAYS_INLINE const Identifier* Lexer<UChar>::makeRightSizedIdentifier(const UChar* characters, size_t length, UChar orAllChars)
  245. {
  246. if (!(orAllChars & ~0xff))
  247. return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
  248. return &m_arena->makeIdentifier(m_vm, characters, length);
  249. }
  250. template <>
  251. ALWAYS_INLINE void Lexer<LChar>::setCodeStart(const StringImpl* sourceString)
  252. {
  253. ASSERT(sourceString->is8Bit());
  254. m_codeStart = sourceString->characters8();
  255. }
  256. template <>
  257. ALWAYS_INLINE void Lexer<UChar>::setCodeStart(const StringImpl* sourceString)
  258. {
  259. ASSERT(!sourceString->is8Bit());
  260. m_codeStart = sourceString->characters16();
  261. }
  262. template <typename T>
  263. ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifierLCharFromUChar(const UChar* characters, size_t length)
  264. {
  265. return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
  266. }
  267. template <typename T>
  268. ALWAYS_INLINE const Identifier* Lexer<T>::makeLCharIdentifier(const LChar* characters, size_t length)
  269. {
  270. return &m_arena->makeIdentifier(m_vm, characters, length);
  271. }
  272. template <typename T>
  273. ALWAYS_INLINE const Identifier* Lexer<T>::makeLCharIdentifier(const UChar* characters, size_t length)
  274. {
  275. return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
  276. }
  277. template <typename T>
  278. ALWAYS_INLINE JSTokenType Lexer<T>::lexExpectIdentifier(JSTokenData* tokenData, JSTokenLocation* tokenLocation, unsigned lexerFlags, bool strictMode)
  279. {
  280. ASSERT((lexerFlags & LexerFlagsIgnoreReservedWords));
  281. const T* start = m_code;
  282. const T* ptr = start;
  283. const T* end = m_codeEnd;
  284. if (ptr >= end) {
  285. ASSERT(ptr == end);
  286. goto slowCase;
  287. }
  288. if (!WTF::isASCIIAlpha(*ptr))
  289. goto slowCase;
  290. ++ptr;
  291. while (ptr < end) {
  292. if (!WTF::isASCIIAlphanumeric(*ptr))
  293. break;
  294. ++ptr;
  295. }
  296. // Here's the shift
  297. if (ptr < end) {
  298. if ((!WTF::isASCII(*ptr)) || (*ptr == '\\') || (*ptr == '_') || (*ptr == '$'))
  299. goto slowCase;
  300. m_current = *ptr;
  301. } else
  302. m_current = 0;
  303. m_code = ptr;
  304. ASSERT(currentOffset() >= currentLineStartOffset());
  305. // Create the identifier if needed
  306. if (lexerFlags & LexexFlagsDontBuildKeywords)
  307. tokenData->ident = 0;
  308. else
  309. tokenData->ident = makeLCharIdentifier(start, ptr - start);
  310. tokenLocation->line = m_lineNumber;
  311. tokenLocation->lineStartOffset = currentLineStartOffset();
  312. tokenLocation->startOffset = offsetFromSourcePtr(start);
  313. tokenLocation->endOffset = currentOffset();
  314. ASSERT(tokenLocation->startOffset >= tokenLocation->lineStartOffset);
  315. m_lastToken = IDENT;
  316. return IDENT;
  317. slowCase:
  318. return lex(tokenData, tokenLocation, lexerFlags, strictMode);
  319. }
  320. } // namespace JSC
  321. #endif // Lexer_h