YarrPattern.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419
  1. /*
  2. * Copyright (C) 2009, 2013 Apple Inc. All rights reserved.
  3. * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions
  7. * are met:
  8. * 1. Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * 2. Redistributions in binary form must reproduce the above copyright
  11. * notice, this list of conditions and the following disclaimer in the
  12. * documentation and/or other materials provided with the distribution.
  13. *
  14. * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  15. * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  17. * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
  18. * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  19. * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  20. * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  21. * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  22. * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #ifndef YarrPattern_h
  27. #define YarrPattern_h
  28. #include <wtf/CheckedArithmetic.h>
  29. #include <wtf/OwnPtr.h>
  30. #include <wtf/PassOwnPtr.h>
  31. #include <wtf/RefCounted.h>
  32. #include <wtf/Vector.h>
  33. #include <wtf/text/WTFString.h>
  34. #include <wtf/unicode/Unicode.h>
  35. namespace JSC { namespace Yarr {
  36. struct PatternDisjunction;
  37. struct CharacterRange {
  38. UChar begin;
  39. UChar end;
  40. CharacterRange(UChar begin, UChar end)
  41. : begin(begin)
  42. , end(end)
  43. {
  44. }
  45. };
  46. struct CharacterClass {
  47. #if ENABLE(JIT) && ENABLE(DETACHED_JIT)
  48. DETACHED_JIT_MAKE_SHARED_DATA_ALLOCATED;
  49. #else
  50. WTF_MAKE_FAST_ALLOCATED;
  51. #endif
  52. public:
  53. // All CharacterClass instances have to have the full set of matches and ranges,
  54. // they may have an optional m_table for faster lookups (which must match the
  55. // specified matches and ranges)
  56. CharacterClass()
  57. : m_table(0)
  58. {
  59. }
  60. CharacterClass(const char* table, bool inverted)
  61. : m_table(table)
  62. , m_tableInverted(inverted)
  63. {
  64. }
  65. Vector_shared<UChar> m_matches;
  66. Vector_shared<CharacterRange> m_ranges;
  67. Vector_shared<UChar> m_matchesUnicode;
  68. Vector_shared<CharacterRange> m_rangesUnicode;
  69. const char* m_table;
  70. bool m_tableInverted;
  71. };
  72. enum QuantifierType {
  73. QuantifierFixedCount,
  74. QuantifierGreedy,
  75. QuantifierNonGreedy,
  76. };
  77. struct PatternTerm {
  78. enum Type {
  79. TypeAssertionBOL,
  80. TypeAssertionEOL,
  81. TypeAssertionWordBoundary,
  82. TypePatternCharacter,
  83. TypeCharacterClass,
  84. TypeBackReference,
  85. TypeForwardReference,
  86. TypeParenthesesSubpattern,
  87. TypeParentheticalAssertion,
  88. TypeDotStarEnclosure,
  89. } type;
  90. bool m_capture :1;
  91. bool m_invert :1;
  92. union {
  93. UChar patternCharacter;
  94. CharacterClass* characterClass;
  95. unsigned backReferenceSubpatternId;
  96. struct {
  97. PatternDisjunction* disjunction;
  98. unsigned subpatternId;
  99. unsigned lastSubpatternId;
  100. bool isCopy;
  101. bool isTerminal;
  102. } parentheses;
  103. struct {
  104. bool bolAnchor : 1;
  105. bool eolAnchor : 1;
  106. } anchors;
  107. };
  108. QuantifierType quantityType;
  109. Checked<unsigned> quantityCount;
  110. int inputPosition;
  111. unsigned frameLocation;
  112. PatternTerm(UChar ch)
  113. : type(PatternTerm::TypePatternCharacter)
  114. , m_capture(false)
  115. , m_invert(false)
  116. {
  117. patternCharacter = ch;
  118. quantityType = QuantifierFixedCount;
  119. quantityCount = 1;
  120. }
  121. PatternTerm(CharacterClass* charClass, bool invert)
  122. : type(PatternTerm::TypeCharacterClass)
  123. , m_capture(false)
  124. , m_invert(invert)
  125. {
  126. characterClass = charClass;
  127. quantityType = QuantifierFixedCount;
  128. quantityCount = 1;
  129. }
  130. PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false)
  131. : type(type)
  132. , m_capture(capture)
  133. , m_invert(invert)
  134. {
  135. parentheses.disjunction = disjunction;
  136. parentheses.subpatternId = subpatternId;
  137. parentheses.isCopy = false;
  138. parentheses.isTerminal = false;
  139. quantityType = QuantifierFixedCount;
  140. quantityCount = 1;
  141. }
  142. PatternTerm(Type type, bool invert = false)
  143. : type(type)
  144. , m_capture(false)
  145. , m_invert(invert)
  146. {
  147. quantityType = QuantifierFixedCount;
  148. quantityCount = 1;
  149. }
  150. PatternTerm(unsigned spatternId)
  151. : type(TypeBackReference)
  152. , m_capture(false)
  153. , m_invert(false)
  154. {
  155. backReferenceSubpatternId = spatternId;
  156. quantityType = QuantifierFixedCount;
  157. quantityCount = 1;
  158. }
  159. PatternTerm(bool bolAnchor, bool eolAnchor)
  160. : type(TypeDotStarEnclosure)
  161. , m_capture(false)
  162. , m_invert(false)
  163. {
  164. anchors.bolAnchor = bolAnchor;
  165. anchors.eolAnchor = eolAnchor;
  166. quantityType = QuantifierFixedCount;
  167. quantityCount = 1;
  168. }
  169. static PatternTerm ForwardReference()
  170. {
  171. return PatternTerm(TypeForwardReference);
  172. }
  173. static PatternTerm BOL()
  174. {
  175. return PatternTerm(TypeAssertionBOL);
  176. }
  177. static PatternTerm EOL()
  178. {
  179. return PatternTerm(TypeAssertionEOL);
  180. }
  181. static PatternTerm WordBoundary(bool invert)
  182. {
  183. return PatternTerm(TypeAssertionWordBoundary, invert);
  184. }
  185. bool invert()
  186. {
  187. return m_invert;
  188. }
  189. bool capture()
  190. {
  191. return m_capture;
  192. }
  193. void quantify(unsigned count, QuantifierType type)
  194. {
  195. quantityCount = count;
  196. quantityType = type;
  197. }
  198. };
  199. struct PatternAlternative {
  200. #if ENABLE(DETACHED_JIT)
  201. DETACHED_JIT_MAKE_SHARED_DATA_ALLOCATED;
  202. #else
  203. WTF_MAKE_FAST_ALLOCATED;
  204. #endif
  205. public:
  206. PatternAlternative(PatternDisjunction* disjunction)
  207. : m_parent(disjunction)
  208. , m_onceThrough(false)
  209. , m_hasFixedSize(false)
  210. , m_startsWithBOL(false)
  211. , m_containsBOL(false)
  212. {
  213. }
  214. PatternTerm& lastTerm()
  215. {
  216. ASSERT(m_terms.size());
  217. return m_terms[m_terms.size() - 1];
  218. }
  219. void removeLastTerm()
  220. {
  221. ASSERT(m_terms.size());
  222. m_terms.shrink(m_terms.size() - 1);
  223. }
  224. void setOnceThrough()
  225. {
  226. m_onceThrough = true;
  227. }
  228. bool onceThrough()
  229. {
  230. return m_onceThrough;
  231. }
  232. Vector_shared<PatternTerm> m_terms;
  233. PatternDisjunction* m_parent;
  234. unsigned m_minimumSize;
  235. bool m_onceThrough : 1;
  236. bool m_hasFixedSize : 1;
  237. bool m_startsWithBOL : 1;
  238. bool m_containsBOL : 1;
  239. };
  240. struct PatternDisjunction {
  241. #if ENABLE(DETACHED_JIT)
  242. DETACHED_JIT_MAKE_SHARED_DATA_ALLOCATED;
  243. #else
  244. WTF_MAKE_FAST_ALLOCATED;
  245. #endif
  246. public:
  247. PatternDisjunction(PatternAlternative* parent = 0)
  248. : m_parent(parent)
  249. , m_hasFixedSize(false)
  250. {
  251. }
  252. PatternAlternative* addNewAlternative()
  253. {
  254. PatternAlternative* alternative = new PatternAlternative(this);
  255. m_alternatives.append(adoptPtr(alternative));
  256. return alternative;
  257. }
  258. Vector_shared<OwnPtr<PatternAlternative> > m_alternatives;
  259. PatternAlternative* m_parent;
  260. unsigned m_minimumSize;
  261. unsigned m_callFrameSize;
  262. bool m_hasFixedSize;
  263. };
  264. // You probably don't want to be calling these functions directly
  265. // (please to be calling newlineCharacterClass() et al on your
  266. // friendly neighborhood YarrPattern instance to get nicely
  267. // cached copies).
  268. CharacterClass* newlineCreate();
  269. CharacterClass* digitsCreate();
  270. CharacterClass* spacesCreate();
  271. CharacterClass* wordcharCreate();
  272. CharacterClass* nondigitsCreate();
  273. CharacterClass* nonspacesCreate();
  274. CharacterClass* nonwordcharCreate();
  275. struct TermChain {
  276. TermChain(PatternTerm term)
  277. : term(term)
  278. {}
  279. PatternTerm term;
  280. Vector<TermChain> hotTerms;
  281. };
  282. struct YarrPattern {
  283. #if ENABLE(DETACHED_JIT)
  284. DETACHED_JIT_MAKE_SHARED_DATA_ALLOCATED;
  285. public:
  286. #endif
  287. JS_EXPORT_PRIVATE YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error);
  288. void reset()
  289. {
  290. m_numSubpatterns = 0;
  291. m_maxBackReference = 0;
  292. m_containsBackreferences = false;
  293. m_containsBOL = false;
  294. newlineCached = 0;
  295. digitsCached = 0;
  296. spacesCached = 0;
  297. wordcharCached = 0;
  298. nondigitsCached = 0;
  299. nonspacesCached = 0;
  300. nonwordcharCached = 0;
  301. m_disjunctions.clear();
  302. m_userCharacterClasses.clear();
  303. }
  304. bool containsIllegalBackReference()
  305. {
  306. return m_maxBackReference > m_numSubpatterns;
  307. }
  308. CharacterClass* newlineCharacterClass()
  309. {
  310. if (!newlineCached)
  311. m_userCharacterClasses.append(adoptPtr(newlineCached = newlineCreate()));
  312. return newlineCached;
  313. }
  314. CharacterClass* digitsCharacterClass()
  315. {
  316. if (!digitsCached)
  317. m_userCharacterClasses.append(adoptPtr(digitsCached = digitsCreate()));
  318. return digitsCached;
  319. }
  320. CharacterClass* spacesCharacterClass()
  321. {
  322. if (!spacesCached)
  323. m_userCharacterClasses.append(adoptPtr(spacesCached = spacesCreate()));
  324. return spacesCached;
  325. }
  326. CharacterClass* wordcharCharacterClass()
  327. {
  328. if (!wordcharCached)
  329. m_userCharacterClasses.append(adoptPtr(wordcharCached = wordcharCreate()));
  330. return wordcharCached;
  331. }
  332. CharacterClass* nondigitsCharacterClass()
  333. {
  334. if (!nondigitsCached)
  335. m_userCharacterClasses.append(adoptPtr(nondigitsCached = nondigitsCreate()));
  336. return nondigitsCached;
  337. }
  338. CharacterClass* nonspacesCharacterClass()
  339. {
  340. if (!nonspacesCached)
  341. m_userCharacterClasses.append(adoptPtr(nonspacesCached = nonspacesCreate()));
  342. return nonspacesCached;
  343. }
  344. CharacterClass* nonwordcharCharacterClass()
  345. {
  346. if (!nonwordcharCached)
  347. m_userCharacterClasses.append(adoptPtr(nonwordcharCached = nonwordcharCreate()));
  348. return nonwordcharCached;
  349. }
  350. bool m_ignoreCase : 1;
  351. bool m_multiline : 1;
  352. bool m_containsBackreferences : 1;
  353. bool m_containsBOL : 1;
  354. unsigned m_numSubpatterns;
  355. unsigned m_maxBackReference;
  356. PatternDisjunction* m_body;
  357. Vector_shared<OwnPtr<PatternDisjunction>, 4> m_disjunctions;
  358. Vector_shared<OwnPtr<CharacterClass> > m_userCharacterClasses;
  359. private:
  360. const char* compile(const String& patternString);
  361. CharacterClass* newlineCached;
  362. CharacterClass* digitsCached;
  363. CharacterClass* spacesCached;
  364. CharacterClass* wordcharCached;
  365. CharacterClass* nondigitsCached;
  366. CharacterClass* nonspacesCached;
  367. CharacterClass* nonwordcharCached;
  368. };
  369. } } // namespace JSC::Yarr
  370. #endif // YarrPattern_h