TokenizerTest.php 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982
  1. <?php
  2. namespace Masterminds\HTML5\Tests\Parser;
  3. use Masterminds\HTML5\Parser\UTF8Utils;
  4. use Masterminds\HTML5\Parser\Scanner;
  5. use Masterminds\HTML5\Parser\Tokenizer;
  6. class TokenizerTest extends \Masterminds\HTML5\Tests\TestCase
  7. {
  8. // ================================================================
  9. // Additional assertions.
  10. // ================================================================
  /**
   * Asserts that an event has the expected type and the expected payload.
   *
   * @param string $type
   *   Expected event type; compared against $event['name'].
   * @param mixed $expects
   *   When this is an array it is compared against the whole of
   *   $event['data']; otherwise it is compared against $event['data'][0].
   * @param array $event
   *   The event under test, read through its 'name' and 'data' keys.
   */
  public function assertEventEquals($type, $expects, $event)
  {
      $this->assertEquals($type, $event['name'], "Event $type for " . print_r($event, true));

      if (is_array($expects)) {
          $this->assertEquals($expects, $event['data'], "Event $type should equal " . print_r($expects, true) . ': ' . print_r($event, true));

          return;
      }

      // An event without data, or with an empty data array, compares as null
      // rather than triggering an "undefined offset" notice.
      $first = (is_array($event['data']) && isset($event['data'][0])) ? $event['data'][0] : null;
      $this->assertEquals($expects, $first, "Event $type should equal $expects: " . print_r($event, true));
  }
  /**
   * Asserts that the given event is named 'error'.
   *
   * @param array $event
   *   The event under test; only its 'name' entry is checked.
   */
  public function assertEventError($event)
  {
      $failureMessage = 'Expected error for event: ' . print_r($event, true);

      $this->assertEquals('error', $event['name'], $failureMessage);
  }
  /**
   * Runs a batch of "input => expectation" checks against the first event.
   *
   * Each key of $tests is parsed on its own. For every case this verifies:
   * - the event stack's depth(), but only when $depth is greater than 0
   * - that event 0 is named $name
   * - that event 0 matches the mapped expectation (see assertEventEquals())
   *
   * @param string $name
   *   Expected name of event 0.
   * @param int $depth
   *   Expected stack depth; a value of 0 or less skips the depth check.
   * @param array $tests
   *   Map of input string => expected value.
   * @param bool $debug
   *   When true, every case is printed to STDOUT before it is parsed.
   */
  protected function isAllGood($name, $depth, $tests, $debug = false)
  {
      $checkDepth = $depth > 0;

      foreach ($tests as $markup => $expected) {
          if ($debug) {
              fprintf(STDOUT, "%s expects %s\n", $markup, print_r($expected, true));
          }

          $stack = $this->parse($markup);

          if ($checkDepth) {
              $this->assertEquals($depth, $stack->depth(), "Expected depth $depth for test $markup." . print_r($stack, true));
          }

          $this->assertEventEquals($name, $expected, $stack->get(0));
      }
  }
  59. // ================================================================
  // Tests.
  61. // ================================================================
  /**
   * Parsing an empty string should record exactly one event, named 'eof'.
   */
  public function testParse()
  {
      list($tok, $events) = $this->createTokenizer('');
      $tok->parse();

      // Check the count before reading event 0 so a wrong count fails clearly,
      // and spell depth() the same way as every other call in this class.
      $this->assertEquals(1, $events->depth());

      $first = $events->get(0);
      $this->assertEquals('eof', $first['name']);
  }
  /**
   * Whitespace-only input should come back as a 'text' event holding the
   * same string, with a stack depth of 2.
   */
  public function testWhitespace()
  {
      $input = ' ';
      $stack = $this->parse($input);

      $this->assertEquals(2, $stack->depth());

      $textEvent = $stack->get(0);
      $this->assertEquals('text', $textEvent['name']);
      $this->assertEquals($input, $textEvent['data'][0]);
  }
  /**
   * Checks decoding of valid character references and that broken ones
   * report an error as their first event.
   */
  public function testCharacterReference()
  {
      $valid = array(
          '&amp;' => '&',
          '&#x0003c;' => '<',
          '&#38;' => '&',
          '&' => '&',
      );
      $this->isAllGood('text', 2, $valid);

      // Broken character references: the first recorded event must be an error.
      foreach (array('&foo', '&#xfoo', '&#foo') as $broken) {
          $first = $this->parse($broken)->get(0);
          $this->assertEquals('error', $first['name']);
      }

      // FIXME: Once the text processor is done, need to verify that the
      // tokens are transformed correctly into text.
  }
  /**
   * Each malformed construct below should yield an error event followed by
   * a 'comment' event whose value is the original input string.
   */
  public function testBogusComment()
  {
      $malformed = array(
          '</+this is a bogus comment. +>',
          '<!+this is a bogus comment. !>',
          '<!D OCTYPE foo bar>',
          '<!DOCTYEP foo bar>',
          '<![CADATA[ TEST ]]>',
          '<![CDATA Hello ]]>',
          '<![CDATA[ Hello [[>',
          '<!CDATA[[ test ]]>',
          '<![CDATA[',
          '<![CDATA[hellooooo hello',
          '<? Hello World ?>',
          '<? Hello World',
      );

      foreach ($malformed as $markup) {
          $stack = $this->parse($markup);

          $this->assertEventError($stack->get(0));
          $this->assertEventEquals('comment', $markup, $stack->get(1));
      }
  }
  /**
   * Covers clean end tags, end tags that are recovered after an error, and
   * end-tag-like input that is reported as a comment instead.
   */
  public function testEndTag()
  {
      $clean = array(
          '</a>' => 'a',
          '</test>' => 'test',
          "</test\n>" => 'test',
          '</thisIsTheTagThatDoesntEndItJustGoesOnAndOnMyFriend>' => 'thisisthetagthatdoesntenditjustgoesonandonmyfriend',
          // See 8.2.4.10, which requires this and does not say error.
          '</a<b>' => 'a<b',
      );
      $this->isAllGood('endTag', 2, $clean);

      // Inputs that first raise an error and then still emit an event. The
      // outer key is the name of the event expected right after the error.
      $afterError = array(
          // Recoverable failures
          'endTag' => array(
              '</a class="monkey">' => 'a',
              '</a <b>' => 'a',
              '</a <b <c>' => 'a',
              '</a is the loneliest letter>' => 'a',
              '</a' => 'a',
          ),
          // BogoComments
          'comment' => array(
              '</>' => '</>',
              '</ >' => '</ >',
              '</ a>' => '</ a>',
          ),
      );

      foreach ($afterError as $eventName => $cases) {
          foreach ($cases as $markup => $expected) {
              $stack = $this->parse($markup);

              $this->assertEquals(3, $stack->depth());
              // Should have triggered an error.
              $this->assertEventError($stack->get(0));
              // Should have tried to parse anyway.
              $this->assertEventEquals($eventName, $expected, $stack->get(1));
          }
      }
  }
  /**
   * Covers well-formed comments and comments that are reported after an
   * error event.
   */
  public function testComment()
  {
      $wellFormed = array(
          '<!--easy-->' => 'easy',
          '<!-- 1 > 0 -->' => ' 1 > 0 ',
          '<!-- --$i -->' => ' --$i ',
          '<!----$i-->' => '--$i',
          "<!--\nHello World.\na-->" => "\nHello World.\na",
          '<!-- <!-- -->' => ' <!-- ',
      );
      foreach ($wellFormed as $markup => $body) {
          $stack = $this->parse($markup);
          $this->assertEventEquals('comment', $body, $stack->get(0));
      }

      $malformed = array(
          '<!-->' => '',
          '<!--Hello' => 'Hello',
          "<!--\0Hello" => UTF8Utils::FFFD . 'Hello',
          '<!--' => '',
      );
      foreach ($malformed as $markup => $body) {
          $stack = $this->parse($markup);

          $this->assertEquals(3, $stack->depth());
          $this->assertEventError($stack->get(0));
          $this->assertEventEquals('comment', $body, $stack->get(1));
      }
  }
  /**
   * Well-formed CDATA sections should produce a 'cdata' event carrying the
   * section's inner text.
   */
  public function testCDATASection()
  {
      $this->isAllGood('cdata', 2, array(
          '<![CDATA[ This is a test. ]]>' => ' This is a test. ',
          '<![CDATA[CDATA]]>' => 'CDATA',
          '<![CDATA[ ]] > ]]>' => ' ]] > ',
          '<![CDATA[ ]]>' => ' ',
      ));
  }
  /**
   * Covers doctype declarations that parse cleanly and ones that are
   * reported after an error event.
   *
   * Every expectation is the full four-element 'doctype' event data array.
   */
  public function testDoctype()
  {
      // Shared expectations for inputs that parse without an error.
      $bareHtml = array('html', 0, null, false);
      $publicFooBar = array('html', EventStack::DOCTYPE_PUBLIC, 'foo bar', false);
      $systemFooBar = array('html', EventStack::DOCTYPE_SYSTEM, 'foo bar', false);

      $good = array(
          '<!DOCTYPE html>' => $bareHtml,
          '<!doctype html>' => $bareHtml,
          '<!DocType html>' => $bareHtml,
          "<!DOCTYPE\nhtml>" => $bareHtml,
          "<!DOCTYPE\fhtml>" => $bareHtml,
          '<!DOCTYPE html PUBLIC "foo bar">' => $publicFooBar,
          "<!DOCTYPE html PUBLIC 'foo bar'>" => $publicFooBar,
          '<!DOCTYPE html PUBLIC "foo bar" >' => $publicFooBar,
          "<!DOCTYPE html \nPUBLIC\n'foo bar'>" => $publicFooBar,
          '<!DOCTYPE html SYSTEM "foo bar">' => $systemFooBar,
          "<!DOCTYPE html SYSTEM 'foo bar'>" => $systemFooBar,
          '<!DOCTYPE html SYSTEM "foo/bar" >' => array('html', EventStack::DOCTYPE_SYSTEM, 'foo/bar', false),
          "<!DOCTYPE html \nSYSTEM\n'foo bar'>" => $systemFooBar,
      );
      $this->isAllGood('doctype', 2, $good);

      // Shared expectations for inputs that are reported after an error.
      $noName = array(null, EventStack::DOCTYPE_NONE, null, true);
      $fooOnly = array('foo', EventStack::DOCTYPE_NONE, null, true);
      $systemFooBarAfterError = array('html', EventStack::DOCTYPE_SYSTEM, 'foo bar', true);

      $bad = array(
          '<!DOCTYPE>' => $noName,
          '<!DOCTYPE >' => $noName,
          '<!DOCTYPE foo' => $fooOnly,
          '<!DOCTYPE foo PUB' => $fooOnly,
          '<!DOCTYPE foo PUB>' => $fooOnly,
          '<!DOCTYPE foo PUB "Looks good">' => $fooOnly,
          '<!DOCTYPE foo SYSTME "Looks good"' => $fooOnly,
          // Can't tell whether these are ids or ID types, since the context is chopped.
          '<!DOCTYPE foo PUBLIC' => $fooOnly,
          '<!DOCTYPE foo PUBLIC>' => $fooOnly,
          '<!DOCTYPE foo SYSTEM' => $fooOnly,
          '<!DOCTYPE foo SYSTEM>' => $fooOnly,
          '<!DOCTYPE html SYSTEM "foo bar"' => $systemFooBarAfterError,
          '<!DOCTYPE html SYSTEM "foo bar" more stuff>' => $systemFooBarAfterError,
      );

      foreach ($bad as $markup => $expected) {
          $stack = $this->parse($markup);

          $this->assertEquals(3, $stack->depth(), "Counting events for '$markup': " . print_r($stack, true));
          $this->assertEventError($stack->get(0));
          $this->assertEventEquals('doctype', $expected, $stack->get(1));
      }
  }
  /**
   * Processing instructions should produce a 'pi' event holding the target
   * name and, when present, the instruction text.
   */
  public function testProcessorInstruction()
  {
      $this->isAllGood('pi', 2, array(
          '<?hph ?>' => 'hph',
          '<?hph echo "Hello World"; ?>' => array('hph', 'echo "Hello World"; '),
          "<?hph \necho 'Hello World';\n?>" => array('hph', "echo 'Hello World';\n"),
      ));
  }
  /**
   * Covers attribute-free tags: plain opening tags, self-closing tags, and
   * tags that are cut off before they are closed.
   */
  public function testSimpleTags()
  {
      $opening = array(
          '<foo>' => 'foo',
          '<FOO>' => 'foo',
          '<fOO>' => 'foo',
          '<foo >' => 'foo',
          "<foo\n\n\n\n>" => 'foo',
          '<foo:bar>' => 'foo:bar',
      );
      $this->isAllGood('startTag', 2, $opening);

      $selfClosing = array(
          '<foo/>' => 'foo',
          '<FOO/>' => 'foo',
          '<foo />' => 'foo',
          "<foo\n\n\n\n/>" => 'foo',
          '<foo:bar/>' => 'foo:bar',
      );
      foreach ($selfClosing as $markup => $tagName) {
          $stack = $this->parse($markup);
          $this->assertEquals(2, $stack->depth(), "Counting events for '$markup'" . print_r($stack, true));

          $startTag = $stack->get(0);
          $this->assertEventEquals('startTag', $tagName, $startTag);
          // Third data element must be true for these self-closing inputs.
          $this->assertTrue($startTag['data'][2]);
      }

      $truncated = array(
          '<foo' => 'foo',
          '<foo ' => 'foo',
          '<foo/' => 'foo',
          '<foo /' => 'foo',
      );
      foreach ($truncated as $markup => $tagName) {
          $stack = $this->parse($markup);

          $this->assertEquals(3, $stack->depth(), "Counting events for '$markup': " . print_r($stack, true));
          $this->assertEventError($stack->get(0));
          $this->assertEventEquals('startTag', $tagName, $stack->get(1));
      }
  }
  /**
   * Tags written as a bare attribute should produce three error events, a
   * startTag named after the attribute, and then eof.
   */
  public function testTagsWithAttributeAndMissingName()
  {
      $cases = array(
          '<id="top_featured">' => 'id',
          '<color="white">' => 'color',
          "<class='neaktivni_stranka'>" => 'class',
          '<bgcolor="white">' => 'bgcolor',
          '<class="nom">' => 'class',
      );

      foreach ($cases as $markup => $tagName) {
          $stack = $this->parse($markup);

          for ($position = 0; $position < 3; ++$position) {
              $this->assertEventError($stack->get($position));
          }
          $this->assertEventEquals('startTag', $tagName, $stack->get(3));
          $this->assertEventEquals('eof', null, $stack->get(4));
      }
  }
  /**
   * Checks the exact event sequence for tags whose name is followed by
   * something other than a proper tag end.
   *
   * Each scenario maps an input to the ordered events expected from index 0:
   * the string 'error' stands for an error event, and an array is an
   * (event name, expected value) pair.
   */
  public function testTagNotClosedAfterTagName()
  {
      $scenarios = array(
          '<noscript<img>' => array(
              'error',
              array('startTag', 'noscript'),
              array('startTag', 'img'),
              array('eof', null),
          ),
          '<center<a>' => array(
              'error',
              array('startTag', 'center'),
              array('startTag', 'a'),
              array('eof', null),
          ),
          '<br<br>' => array(
              'error',
              array('startTag', 'br'),
              array('startTag', 'br'),
              array('eof', null),
          ),
          '<span<>02</span>' => array(
              'error',
              array('startTag', 'span'),
              'error',
              array('text', '>02'),
              array('endTag', 'span'),
              array('eof', null),
          ),
          '<p</p>' => array(
              'error',
              array('startTag', 'p'),
              array('endTag', 'p'),
              array('eof', null),
          ),
          '<strong><WordPress</strong>' => array(
              array('startTag', 'strong'),
              'error',
              array('startTag', 'wordpress'),
              array('endTag', 'strong'),
              array('eof', null),
          ),
          '<src=<a>' => array(
              'error',
              'error',
              'error',
              array('startTag', 'src'),
              array('startTag', 'a'),
              array('eof', null),
          ),
          '<br...<a>' => array(
              'error',
              array('startTag', 'br'),
              array('eof', null),
          ),
      );

      foreach ($scenarios as $markup => $sequence) {
          $stack = $this->parse($markup);

          foreach ($sequence as $position => $step) {
              if ('error' === $step) {
                  $this->assertEventError($stack->get($position));
              } else {
                  $this->assertEventEquals($step[0], $step[1], $stack->get($position));
              }
          }
      }
  }
  /**
   * Tag names followed by an illegal character should produce an error
   * event and then a startTag for the part of the name before it.
   */
  public function testIllegalTagNames()
  {
      $cases = array(
          '<li">' => 'li',
          '<p">' => 'p',
          '<b&nbsp; >' => 'b',
          '<static*all>' => 'static',
          '<h*0720/>' => 'h',
          '<st*ATTRIBUTE />' => 'st',
      );

      foreach ($cases as $markup => $tagName) {
          $stack = $this->parse($markup);

          $this->assertEventError($stack->get(0));
          $this->assertEventEquals('startTag', $tagName, $stack->get(1));
      }
  }
  /**
   * Covers attribute parsing on start tags.
   *
   * Expectations are full 'startTag' data arrays of the form
   * array(tag name, attribute map, third element), where the third element
   * is true only for the self-closing inputs below.
   *
   * @depends testCharacterReference
   */
  public function testTagAttributes()
  {
      // Opening tags.
      $good = array(
          '<foo bar="baz">' => array('foo', array('bar' => 'baz'), false),
          '<foo bar=" baz ">' => array('foo', array('bar' => ' baz '), false),
          "<foo bar=\"\nbaz\n\">" => array('foo', array('bar' => "\nbaz\n"), false),
          "<foo bar='baz'>" => array('foo', array('bar' => 'baz'), false),
          '<foo bar="A full sentence.">' => array('foo', array('bar' => 'A full sentence.'), false),
          "<foo a='1' b=\"2\">" => array('foo', array('a' => '1', 'b' => '2'), false),
          "<foo ns:bar='baz'>" => array('foo', array('ns:bar' => 'baz'), false),
          "<foo a='blue&red'>" => array('foo', array('a' => 'blue&red'), false),
          "<foo a='blue&amp;red'>" => array('foo', array('a' => 'blue&red'), false),
          "<foo a='blue&&amp;&red'>" => array('foo', array('a' => 'blue&&&red'), false),
          "<foo a='blue&&amp;red'>" => array('foo', array('a' => 'blue&&red'), false),
          "<foo\nbar='baz'\n>" => array('foo', array('bar' => 'baz'), false),
          '<doe a deer>' => array('doe', array('a' => null, 'deer' => null), false),
          '<foo bar=baz>' => array('foo', array('bar' => 'baz'), false),
          // Updated for 8.1.2.3
          '<foo bar = "baz" >' => array('foo', array('bar' => 'baz'), false),
          // The spec allows an unquoted value '/'. This will not be a closing
          // tag.
          '<foo bar=/>' => array('foo', array('bar' => '/'), false),
          '<foo bar=baz/>' => array('foo', array('bar' => 'baz/'), false),
      );
      $this->isAllGood('startTag', 2, $good);

      // Self-closing tags.
      $withEnd = array(
          '<foo bar="baz"/>' => array('foo', array('bar' => 'baz'), true),
          '<foo BAR="baz"/>' => array('foo', array('bar' => 'baz'), true),
          '<foo BAR="BAZ"/>' => array('foo', array('bar' => 'BAZ'), true),
          "<foo a='1' b=\"2\" c=3 d/>" => array(
              'foo',
              array('a' => '1', 'b' => '2', 'c' => '3', 'd' => null),
              true,
          ),
      );
      $this->isAllGood('startTag', 2, $withEnd);

      // Cause a parse error.
      $bad = array(
          // This will emit an entity lookup failure for &+dark.
          "<foo a='blue&+dark'>" => array('foo', array('a' => 'blue&+dark'), false),
          '<foo bar=>' => array('foo', array('bar' => null), false),
          '<foo bar="oh' => array('foo', array('bar' => 'oh'), false),
          '<foo bar=oh">' => array('foo', array('bar' => 'oh"'), false),
          // these attributes are ignored because of current implementation
          // of method "DOMElement::setAttribute"
          // see issue #23: https://github.com/Masterminds/html5-php/issues/23
          '<foo b"="baz">' => array('foo', array(), false),
          '<foo 2abc="baz">' => array('foo', array(), false),
          '<foo ?="baz">' => array('foo', array(), false),
          '<foo foo?bar="baz">' => array('foo', array(), false),
      );
      foreach ($bad as $markup => $expected) {
          $stack = $this->parse($markup);

          $this->assertEquals(3, $stack->depth(), "Counting events for '$markup': " . print_r($stack, true));
          $this->assertEventError($stack->get(0));
          $this->assertEventEquals('startTag', $expected, $stack->get(1));
      }

      // Cause multiple parse errors.
      $reallyBad = array(
          '<foo ="bar">' => array('foo', array('=' => null, '"bar"' => null), false),
          '<foo////>' => array('foo', array(), true),
          // character "&" in unquoted attribute shouldn't cause an infinite loop
          '<foo bar=index.php?str=1&amp;id=29>' => array('foo', array('bar' => 'index.php?str=1&id=29'), false),
      );
      // NOTE: the mapped startTag expectations above are not asserted; only
      // the two leading error events are checked for these inputs.
      foreach (array_keys($reallyBad) as $markup) {
          $stack = $this->parse($markup);

          $this->assertEventError($stack->get(0));
          $this->assertEventError($stack->get(1));
      }

      // Regression: Malformed elements should be detected.
      $stack = $this->parse('<foo baz="1" <bar></foo>');
      $this->assertEventError($stack->get(0));
      $this->assertEventEquals('startTag', array('foo', array('baz' => '1'), false), $stack->get(1));
      $this->assertEventEquals('startTag', array('bar', array(), false), $stack->get(2));
      $this->assertEventEquals('endTag', array('foo'), $stack->get(3));
  }
  /**
   * Content of script and title elements should be reported as one text
   * event between the start and end tag, without being parsed as markup.
   */
  public function testRawText()
  {
      $closedScripts = array(
          '<script>abcd efg hijk lmnop</script> ' => 'abcd efg hijk lmnop',
          '<script><not/><the/><tag></script>' => '<not/><the/><tag>',
          '<script><<<<<<<<</script>' => '<<<<<<<<',
          '<script>hello</script</script>' => 'hello</script',
          "<script>\nhello</script\n</script>" => "\nhello</script\n",
          '<script>&amp;</script>' => '&amp;',
          '<script><!--not a comment--></script>' => '<!--not a comment-->',
          '<script><![CDATA[not a comment]]></script>' => '<![CDATA[not a comment]]>',
      );
      foreach ($closedScripts as $markup => $rawText) {
          $stack = $this->parse($markup);

          $this->assertEventEquals('startTag', 'script', $stack->get(0));
          $this->assertEventEquals('text', $rawText, $stack->get(1));
          $this->assertEventEquals('endTag', 'script', $stack->get(2));
      }

      // Scripts that never get a complete end tag: an error event is expected
      // between the start tag and the text.
      $unclosedScripts = array(
          '<script>&amp;</script' => '&amp;</script',
          '<script>Hello world' => 'Hello world',
      );
      foreach ($unclosedScripts as $markup => $rawText) {
          $stack = $this->parse($markup);

          $this->assertEquals(4, $stack->depth(), "Counting events for '$markup': " . print_r($stack, true));
          $this->assertEventEquals('startTag', 'script', $stack->get(0));
          $this->assertEventError($stack->get(1));
          $this->assertEventEquals('text', $rawText, $stack->get(2));
      }

      $titles = array(
          // Testing case sensitivity
          '<TITLE>a test</TITLE>' => 'a test',
          // Testing end tags with whitespaces
          '<title>Whitespaces are tasty</title >' => 'Whitespaces are tasty',
      );
      foreach ($titles as $markup => $titleText) {
          $stack = $this->parse($markup);

          $this->assertEventEquals('startTag', 'title', $stack->get(0));
          $this->assertEventEquals('text', $titleText, $stack->get(1));
          $this->assertEventEquals('endTag', 'title', $stack->get(2));
      }
  }
  /**
   * With the tokenizer switched to RCDATA text mode for 'title', the second
   * event should be text with the character reference decoded and the
   * comment-like markup left as literal text.
   */
  public function testRcdata()
  {
      $markup = '<title>&#x27;<!-- not a comment --></TITLE>';

      list($tokenizer, $stack) = $this->createTokenizer($markup);
      $tokenizer->setTextMode(\Masterminds\HTML5\Elements::TEXT_RCDATA, 'title');
      $tokenizer->parse();

      $this->assertEventEquals('text', "'<!-- not a comment -->", $stack->get(1));
  }
  /**
   * Checks text events on their own and mixed with tags, CDATA, comments
   * and character references.
   *
   * Each scenario maps an input to array(expected depth, ordered list of
   * (event name, expected value) pairs starting at event 0).
   */
  public function testText()
  {
      $scenarios = array(
          'a<br>b' => array(4, array(
              array('text', 'a'),
              array('startTag', 'br'),
              array('text', 'b'),
          )),
          '<a>Test</a>' => array(4, array(
              array('startTag', 'a'),
              array('text', 'Test'),
              array('endTag', 'a'),
          )),
          '<p>0</p><p>1</p>' => array(7, array(
              array('startTag', 'p'),
              array('text', '0'),
              array('endTag', 'p'),
              array('startTag', 'p'),
              array('text', '1'),
              array('endTag', 'p'),
          )),
          'a<![CDATA[test]]>b' => array(4, array(
              array('text', 'a'),
              array('cdata', 'test'),
              array('text', 'b'),
          )),
          'a<!--test-->b' => array(4, array(
              array('text', 'a'),
              array('comment', 'test'),
              array('text', 'b'),
          )),
          'a&amp;b' => array(2, array(
              array('text', 'a&b'),
          )),
          'a&sup2;b' => array(2, array(
              array('text', 'a²b'),
          )),
      );

      foreach ($scenarios as $markup => $scenario) {
          list($depth, $sequence) = $scenario;
          $stack = $this->parse($markup);

          $this->assertEquals($depth, $stack->depth(), 'Events: ' . print_r($stack, true));
          foreach ($sequence as $position => $step) {
              $this->assertEventEquals($step[0], $step[1], $stack->get($position));
          }
      }
  }
  887. // ================================================================
  888. // Utility functions.
  889. // ================================================================
  /**
   * Builds a tokenizer over the given string together with the event stack
   * it reports to.
   *
   * @param string $string
   *   Input handed to the Scanner.
   * @param bool $debug
   *   Value assigned to the scanner's debug property.
   *
   * @return array
   *   A two-element array: the Tokenizer, then the EventStack.
   */
  protected function createTokenizer($string, $debug = false)
  {
      $scanner = new Scanner($string);
      $scanner->debug = $debug;

      $stack = new EventStack();
      $tokenizer = new Tokenizer($scanner, $stack);

      return array($tokenizer, $stack);
  }
  /**
   * Tokenizes the given string and returns the event stack that was filled.
   *
   * @param string $string
   *   Input to tokenize.
   * @param bool $debug
   *   Passed through to createTokenizer().
   *
   * @return EventStack
   *   The stack created by createTokenizer(), after parse() has run.
   */
  public function parse($string, $debug = false)
  {
      $pair = $this->createTokenizer($string, $debug);
      $pair[0]->parse();

      return $pair[1];
  }
  906. }