XMLParser.java 34 KB


  1. package gnu.xml;
  2. import java.io.*;
  3. import gnu.text.*;
  4. import gnu.lists.*;
  5. import gnu.kawa.io.BinaryInPort;
  6. import gnu.kawa.io.InPort;
  7. import gnu.kawa.io.Path;
/** Reads XML from a char array.
 * Assumes a stateless character encoding containing ASCII as a subset,
 * and where no byte in a multi-byte character is the same as an XML special
 * character.  Any bytes with the high-order bit set are treated as if they
 * are letters, and can be part of names.
 *
 * Handles CR/LF, CDATA, entity references, processing instructions, DOCTYPE,
 * as well as the obvious (text, elements, and attributes).
 *
 * @author Per Bothner
 */
  19. public class XMLParser
  20. {
  // "Modifier" bits that are added to a base state to request that a
  // "subroutine" runs before the base state is entered.  The parse loop
  // subtracts the modifier again once the subroutine is done.
  /** Added to a state: first read a Name, then continue in the base state. */
  private static final int EXPECT_NAME_MODIFIER = 1;
  /** Added to a state: first skip white space, then continue in the base state. */
  private static final int SKIP_SPACES_MODIFIER = 2;

  // Base states of the parse loop.  The numeric values are spaced so that
  // every base-state-plus-modifier sum used as a 'case' label is distinct.
  private static final int INIT_STATE = 0; // Before the first character is read.
  private static final int TEXT_STATE = 1; // In character data or an attribute value.
  private static final int BEGIN_ELEMENT_STATE = 2; // Seen '<' Name.
  private static final int END_ELEMENT_STATE = 4; // Seen '</' Name.
  private static final int SAW_ENTITY_REF = 6; // Seen '&' Name.
  private static final int ATTRIBUTE_SEEN_NAME_STATE = 8; // Seen an attribute Name.
  private static final int MAYBE_ATTRIBUTE_STATE = 10; // Inside a start tag, between attributes.
  private static final int ATTRIBUTE_SEEN_EQ_STATE = 11; // Seen attribute Name '='.
  private static final int DOCTYPE_SEEN_STATE = 13; // Seen '<!DOCTYPE'.
  private static final int DOCTYPE_NAME_SEEN_STATE = 16; // Seen '<!DOCTYPE' Name.
  private static final int SAW_LEFT_STATE = 14; // Seen '<'.
  private static final int SAW_LEFT_SLASH_STATE = 19; // Seen '</'
  private static final int SAW_LEFT_EXCL_STATE = 20; // Seen '<!'.
  private static final int SAW_LEFT_QUEST_STATE = 21; // Seen '<?'
  // NOTE(review): not referenced by the parse loop in this file - confirm it is unused.
  private static final int SAW_LEFT_EXCL_MINUS_STATE = 22;
  private static final int SAW_AMP_STATE = 25; // Saw '&'.
  private static final int SAW_AMP_SHARP_STATE = 26; // Saw '&#'.
  private static final int EXPECT_RIGHT_STATE = 27; // Looking for '>'.
  private static final int PREV_WAS_CR_STATE = 28; // Previous character was '\r'.
  private static final int INIT_LEFT_QUEST_STATE = 30; // Seen '<?' as the first thing in the input.
  private static final int INIT_TEXT_STATE = 31; // Looking at the first character of the input.
  private static final int INIT_LEFT_STATE = 34; // Seen '<' as the first character of the input.

  // Error states.
  private static final int INVALID_VERSION_DECL = 35; // Reports "invalid xml version specifier".
  private static final int SAW_ERROR = 36; // Reports 'message', then skips past the next '>'.
  private static final int SAW_EOF_ERROR = 37; // Unexpected end-of-file.
  private static final int MISSING_XML_DECL = 38; // Reports "missing XML declaration".

  /** Error message reported for a malformed 'encoding' pseudo-attribute. */
  static final String BAD_ENCODING_SYNTAX = "bad 'encoding' declaration";
  /** Error message reported for a malformed 'standalone' pseudo-attribute. */
  static final String BAD_STANDALONE_SYNTAX = "bad 'standalone' declaration";
  51. public static void parse (Object uri, SourceMessages messages, Consumer out)
  52. throws java.io.IOException
  53. {
  54. parse(Path.openInputStream(uri), uri, messages, out);
  55. }
  56. public static BinaryInPort XMLStreamReader(InputStream strm)
  57. throws java.io.IOException {
  58. BinaryInPort in = new BinaryInPort(strm);
  59. in.setFromByteOrderMark();
  60. in.setKeepFullLines(false);
  61. return in;
  62. }
  63. public static void parse (InputStream strm, Object uri,
  64. SourceMessages messages, Consumer out)
  65. throws java.io.IOException
  66. {
  67. BinaryInPort in = XMLStreamReader(strm);
  68. if (uri != null)
  69. in.setName(uri);
  70. parse(in, messages, out);
  71. in.close();
  72. }
  73. public static void parse (InPort in, SourceMessages messages, Consumer out)
  74. throws java.io.IOException
  75. {
  76. XMLFilter filter = new XMLFilter(out);
  77. filter.setMessages(messages);
  78. filter.setSourceLocator(in);
  79. filter.startDocument();
  80. Object uri = in.getPath();
  81. if (uri != null)
  82. filter.writeDocumentUri(uri);
  83. parse(in, filter);
  84. filter.endDocument();
  85. }
  86. public static void parse (InPort in, SourceMessages messages, XMLFilter filter)
  87. throws java.io.IOException
  88. {
  89. filter.setMessages(messages);
  90. filter.setSourceLocator(in);
  91. filter.startDocument();
  92. Object uri = in.getPath();
  93. if (uri != null)
  94. filter.writeDocumentUri(uri);
  95. parse(in, filter);
  96. filter.endDocument();
  97. in.close();
  98. }
  99. public static void parse(InPort in, XMLFilter out)
  100. {
  101. // Cache fields in local variables, for speed.
  102. char[] buffer = in.buffer;
  103. int pos = in.pos;
  104. int limit = in.limit;
  105. boolean strict = false;
  106. // The flow logic of this method is unusual. It is one big state machine,
  107. // but with two "subroutines": SKIP_SPACES_MODIFIER and EXPECT_NAME_MODIFIER.
  108. // There is also a "subroutine" to get a new character (and leave it in 'ch')
  109. // when 'break handleChar' is executed, except this has the hard-wired
  110. // continuation of switching on the 'state'.
  111. //
  112. // The justification for this rather usual design is performance.
  113. // As long as the input is contained within 'buffer', we don't need
  114. // to call input methods (only methods for emitting parsed data is
  115. // called). We also maximize use of local variables - we do not
  116. // access any object fields (including fields of 'this') except
  117. // for getting the next char from 'buffer'. These properties mean
  118. // this method can be compiled to very tight efficient code.
  119. int state = INIT_STATE;
  120. // 0: normal - in character context.
  121. // 1: seen '&'
  122. // The next two varibles are only relevant if state==INIT_STATE:
  123. char terminator = (char) '<';
  124. int continue_state = SAW_LEFT_STATE;
  125. char ch = (char) ' '; // ???
  126. int length = 0;
  127. int dstart = -1;
  128. String message = null;
  129. int start = -1;
  130. mainLoop:
  131. for (;;)
  132. {
  133. handleChar: // When done get next character.
  134. switch (state)
  135. {
  136. case INIT_STATE:
  137. state = INIT_TEXT_STATE;
  138. break handleChar;
  139. case INIT_TEXT_STATE:
  140. if (ch == '<')
  141. {
  142. state = INIT_LEFT_STATE;
  143. break handleChar;
  144. }
  145. state = strict ? MISSING_XML_DECL : TEXT_STATE;
  146. continue mainLoop;
  147. case INIT_LEFT_STATE:
  148. if (ch == '?')
  149. {
  150. start = pos;
  151. state = EXPECT_NAME_MODIFIER + SKIP_SPACES_MODIFIER + INIT_LEFT_QUEST_STATE;
  152. break handleChar;
  153. }
  154. state = strict ? MISSING_XML_DECL : SAW_LEFT_STATE;
  155. continue mainLoop;
  156. case MISSING_XML_DECL:
  157. message = "missing XML declaration";
  158. state = SAW_ERROR;
  159. continue mainLoop;
  160. case INVALID_VERSION_DECL:
  161. pos = dstart;
  162. message = "invalid xml version specifier";
  163. state = SAW_ERROR;
  164. continue mainLoop;
  165. case SAW_ERROR:
  166. in.pos = pos;
  167. start = -1;
  168. out.error('e', message);
  169. for (;;)
  170. {
  171. if (pos >= limit)
  172. break mainLoop;
  173. ch = buffer[pos++];
  174. if (ch == '>')
  175. {
  176. state = TEXT_STATE;
  177. break handleChar;
  178. }
  179. }
  180. case SAW_EOF_ERROR:
  181. in.pos = pos;
  182. out.error('f', "unexpected end-of-file");
  183. return;
  184. case TEXT_STATE:
  185. // This state handle text not inside tags (in which case
  186. // terminator=='<'). It also handles attribute values (in
  187. // which case terminator is '\'' or '"').
  188. start = pos - 1;
  189. // Not length now, but used to calculate length when done.
  190. length = pos;
  191. for (;;)
  192. {
  193. if (ch == terminator)
  194. {
  195. state = continue_state;
  196. break;
  197. }
  198. if (ch == '&')
  199. {
  200. state = SAW_AMP_STATE;
  201. break;
  202. }
  203. if (ch == '\r')
  204. {
  205. length = pos - length;
  206. in.pos = pos;
  207. if (length > 0)
  208. out.textFromParser(buffer, start, length);
  209. if (pos < limit)
  210. {
  211. ch = buffer[pos];
  212. if (ch == '\n')
  213. {
  214. start = pos;
  215. length = ++pos;
  216. }
  217. else
  218. {
  219. out.linefeedFromParser();
  220. if (ch == 0x85)
  221. {
  222. start = pos++;
  223. length = pos + 1;
  224. }
  225. else
  226. {
  227. in.incrLineNumber(1, pos);
  228. start = pos;
  229. length = ++pos;
  230. continue;
  231. }
  232. }
  233. in.incrLineNumber(1, pos);
  234. }
  235. else
  236. {
  237. out.linefeedFromParser();
  238. state = PREV_WAS_CR_STATE;
  239. break handleChar;
  240. }
  241. }
  242. else if (ch == 0x85 || ch == 0x2028)
  243. {
  244. length = pos - length;
  245. in.pos = pos-1;
  246. if (length > 0)
  247. out.textFromParser(buffer, start, length);
  248. out.linefeedFromParser();
  249. in.incrLineNumber(1, pos);
  250. length = pos + 1;
  251. start = pos;
  252. }
  253. else if (ch == '\n')
  254. {
  255. in.incrLineNumber(1, pos);
  256. }
  257. if (pos == limit)
  258. {
  259. length--;
  260. break;
  261. }
  262. ch = buffer[pos++];
  263. }
  264. length = pos - length;
  265. if (length > 0)
  266. {
  267. in.pos = pos;
  268. out.textFromParser(buffer, start, length);
  269. }
  270. start = -1;
  271. break handleChar;
  272. case PREV_WAS_CR_STATE:
  273. // The previous character was a '\r', and we passed along '\n'
  274. // to out. If the new character is '\n' or 0x85 ignore it.
  275. state = TEXT_STATE;
  276. if (ch == '\n' || ch == 0x85)
  277. {
  278. in.incrLineNumber(1, pos);
  279. break handleChar;
  280. }
  281. else
  282. {
  283. in.incrLineNumber(1, pos-1);
  284. continue;
  285. }
  286. case SKIP_SPACES_MODIFIER + EXPECT_RIGHT_STATE:
  287. case SKIP_SPACES_MODIFIER + MAYBE_ATTRIBUTE_STATE:
  288. case SKIP_SPACES_MODIFIER + SAW_LEFT_QUEST_STATE:
  289. case SKIP_SPACES_MODIFIER + INIT_LEFT_QUEST_STATE:
  290. case SKIP_SPACES_MODIFIER + DOCTYPE_SEEN_STATE:
  291. // "Subroutine" for skipping whitespace.
  292. if (ch == ' ' || ch == '\t')
  293. break handleChar;
  294. if (ch == '\n' || ch == '\r'
  295. || ch == '\u0085' || ch == '\u2028')
  296. {
  297. in.incrLineNumber(1, pos);
  298. break handleChar;
  299. }
  300. // Not a space, so "return" to next state.
  301. state -= SKIP_SPACES_MODIFIER;
  302. continue mainLoop;
  303. case EXPECT_NAME_MODIFIER + BEGIN_ELEMENT_STATE:
  304. case EXPECT_NAME_MODIFIER + END_ELEMENT_STATE:
  305. case EXPECT_NAME_MODIFIER + ATTRIBUTE_SEEN_NAME_STATE:
  306. case EXPECT_NAME_MODIFIER + SAW_ENTITY_REF:
  307. case EXPECT_NAME_MODIFIER + DOCTYPE_NAME_SEEN_STATE:
  308. case EXPECT_NAME_MODIFIER + SKIP_SPACES_MODIFIER + SAW_LEFT_QUEST_STATE:
  309. case EXPECT_NAME_MODIFIER + SKIP_SPACES_MODIFIER + INIT_LEFT_QUEST_STATE:
  310. length = start+1;
  311. // "Subroutine" for reading a Name.
  312. for (;;)
  313. {
  314. // XML 1.1 candidate recommendation:
  315. // [2] Char ::= #x9 | #xA | #xD | [#x20-#x7E] | #x85
  316. // | [#xA0-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
  317. // [4] NameStartChar := ":" | [A-Z] | "_" | [a-z] |
  318. // [#xC0-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] |
  319. // [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] |
  320. // [#x3001-#xD7FF] | [#xF900-#xEFFFF]
  321. // [4a] NameChar := NameStartChar | "-" | "." | [0-9] | #xB7 |
  322. // [#x0300-#x036F] | [#x203F-#x2040]
  323. if ((ch >= 'a' && ch <= 'z') ||
  324. (ch >= 'A' && ch <= 'Z') ||
  325. ch == '_' || ch == ':' ||
  326. (ch >= 0xC0 && (ch <= 0x2FF ||
  327. (ch >= 0x370 &&
  328. ((ch <= 0x1FFF && ch != 0x37E) ||
  329. (ch >= 0x200C &&
  330. (ch <= 0x200D ||
  331. (ch >= 0x2070 && ch <= 0x218F)||
  332. (ch >= 0x2C00 && ch <= 0x2FEF) ||
  333. (ch >= 0x3001 && ch <= 0xD7FF) ||
  334. (ch >= 0xF900 && ch <= 0xFFFD))))))) ||
  335. (pos > length &&
  336. (ch >= '0' && ch <= '9') ||
  337. ch == '.' || ch == '-' ||
  338. ch == 0xB7 ||
  339. (ch > 0x300 &&
  340. (ch <= 0x36F || (ch >= 0x203F && ch <= 0x2040)))))
  341. {
  342. }
  343. else
  344. {
  345. state -= EXPECT_NAME_MODIFIER;
  346. length = pos - length;
  347. if (length == 0)
  348. {
  349. if (state == ATTRIBUTE_SEEN_NAME_STATE)
  350. message = "missing or invalid attribute name";
  351. else if (state == BEGIN_ELEMENT_STATE
  352. || state == END_ELEMENT_STATE)
  353. message = "missing or invalid element name";
  354. else
  355. message = "missing or invalid name";
  356. state = SAW_ERROR;
  357. }
  358. continue mainLoop;
  359. }
  360. if (pos < limit)
  361. ch = buffer[pos++];
  362. else
  363. break handleChar;
  364. }
  365. case SAW_AMP_SHARP_STATE:
  366. for (;;)
  367. {
  368. if (ch == ';')
  369. {
  370. in.pos = pos;
  371. out.emitCharacterReference(length,
  372. buffer, start, pos-1-start);
  373. state = TEXT_STATE;
  374. break handleChar;
  375. }
  376. if (ch == 'x' && dstart == 0)
  377. dstart = 16;
  378. else if (length >= 0x8000000)
  379. break; // Overflow likely.
  380. else
  381. {
  382. int base = dstart == 0 ? 10 : dstart;
  383. int digit = Character.digit((char) ch, base);
  384. if (digit < 0)
  385. break;
  386. length = length * base + digit;
  387. }
  388. if (pos < limit)
  389. ch = buffer[pos++];
  390. else
  391. break handleChar;
  392. }
  393. in.pos = pos;
  394. out.error('e', "invalid character reference");
  395. state = TEXT_STATE;
  396. break handleChar;
  397. case SAW_AMP_STATE:
  398. if (ch == '#')
  399. {
  400. state = SAW_AMP_SHARP_STATE;
  401. start = pos;
  402. length = 0; // accumulated value; -1 means error, -2 overflow
  403. dstart = 0; // base - 0 means not seen yet
  404. break handleChar;
  405. }
  406. start = pos - 1;
  407. state = EXPECT_NAME_MODIFIER + SAW_ENTITY_REF;
  408. continue mainLoop;
  409. case SAW_ENTITY_REF:
  410. in.pos = pos;
  411. if (ch != ';')
  412. out.error('w', "missing ';'");
  413. out.emitEntityReference(buffer, start, length);
  414. start = -1;
  415. state = TEXT_STATE;
  416. break handleChar;
  417. case SAW_LEFT_STATE: // Saw '<'
  418. if (ch == '/')
  419. {
  420. state = SAW_LEFT_SLASH_STATE;
  421. break handleChar;
  422. }
  423. if (ch == '?')
  424. {
  425. start = pos;
  426. state = EXPECT_NAME_MODIFIER + SKIP_SPACES_MODIFIER + SAW_LEFT_QUEST_STATE;
  427. break handleChar;
  428. }
  429. if (ch == '!')
  430. {
  431. state = SAW_LEFT_EXCL_STATE;
  432. start = pos;
  433. break handleChar;
  434. }
  435. // Read Name then goto BEGIN_ELEMENT_STATE.
  436. start = pos - 1;
  437. state = EXPECT_NAME_MODIFIER + BEGIN_ELEMENT_STATE;
  438. continue mainLoop;
  439. case BEGIN_ELEMENT_STATE:
  440. in.pos = pos-length; // position of start of name, for errors.
  441. out.emitStartElement(buffer, start, length);
  442. state = SKIP_SPACES_MODIFIER + MAYBE_ATTRIBUTE_STATE;
  443. start = -1;
  444. continue mainLoop;
  445. case SAW_LEFT_QUEST_STATE: // Seen '<?' Name Spaces
  446. case INIT_LEFT_QUEST_STATE: // Seen '<?' Name Spaces
  447. if (dstart < 0)
  448. dstart = pos - 1;
  449. for (;;)
  450. {
  451. int end;
  452. if (ch == '>'
  453. && buffer[end = pos - 2] == '?'
  454. && end >= dstart)
  455. {
  456. in.pos = pos;
  457. if (length == 3
  458. && buffer[start] == 'x'
  459. && buffer[start+1] == 'm'
  460. && buffer[start+2] == 'l')
  461. {
  462. if (state == INIT_LEFT_QUEST_STATE)
  463. {
  464. if (end <= dstart+7
  465. || buffer[dstart] != 'v'
  466. || buffer[dstart+1] != 'e'
  467. || buffer[dstart+2] != 'r'
  468. || buffer[dstart+3] != 's'
  469. || buffer[dstart+4] != 'i'
  470. || buffer[dstart+5] != 'o'
  471. || buffer[dstart+6] != 'n')
  472. {
  473. // FIXME should allow if !strict
  474. pos = dstart;
  475. message = "xml declaration without version";
  476. state = SAW_ERROR;
  477. continue mainLoop;
  478. }
  479. dstart += 7;
  480. ch = buffer[dstart];
  481. while (Character.isWhitespace(ch)
  482. && ++dstart < end)
  483. ch = buffer[dstart];
  484. if (ch != '=')
  485. {
  486. state = INVALID_VERSION_DECL;
  487. continue mainLoop;
  488. }
  489. ch = buffer[++dstart];
  490. while (Character.isWhitespace(ch)
  491. && ++dstart < end)
  492. ch = buffer[dstart];
  493. if (ch != '\'' && ch != '\"')
  494. {
  495. state = INVALID_VERSION_DECL;
  496. continue mainLoop;
  497. }
  498. char quote = ch;
  499. int i = ++dstart;
  500. for (;; i++)
  501. {
  502. if (i == end)
  503. {
  504. state = INVALID_VERSION_DECL;
  505. continue mainLoop;
  506. }
  507. ch = buffer[i];
  508. if (ch == quote)
  509. break;
  510. }
  511. if (i == dstart + 3 && buffer[dstart] == '1'
  512. && buffer[dstart+1] == '.'
  513. && (ch = buffer[dstart+2]) == '0' || ch == '1')
  514. {
  515. // Save version number, if that is useful.
  516. }
  517. else
  518. {
  519. state = INVALID_VERSION_DECL;
  520. continue mainLoop;
  521. }
  522. dstart = i+1;
  523. while (dstart < end
  524. && Character.isWhitespace(buffer[dstart]))
  525. dstart++;
  526. if (end > dstart + 7
  527. && buffer[dstart] == 'e'
  528. && buffer[dstart+1] == 'n'
  529. && buffer[dstart+2] == 'c'
  530. && buffer[dstart+3] == 'o'
  531. && buffer[dstart+4] == 'd'
  532. && buffer[dstart+5] == 'i'
  533. && buffer[dstart+6] == 'n'
  534. && buffer[dstart+7] == 'g')
  535. {
  536. dstart += 8;
  537. ch = buffer[dstart];
  538. while (Character.isWhitespace(ch)
  539. && ++dstart < end)
  540. ch = buffer[dstart];
  541. if (ch != '=')
  542. {
  543. message = BAD_ENCODING_SYNTAX;
  544. state = SAW_ERROR;
  545. continue mainLoop;
  546. }
  547. ch = buffer[++dstart];
  548. while (Character.isWhitespace(ch)
  549. && ++dstart < end)
  550. ch = buffer[dstart];
  551. if (ch != '\'' && ch != '\"')
  552. {
  553. message = BAD_ENCODING_SYNTAX;
  554. state = SAW_ERROR;
  555. continue mainLoop;
  556. }
  557. quote = ch;
  558. i = ++dstart;
  559. for (;; i++)
  560. {
  561. if (i == end)
  562. {
  563. message = BAD_ENCODING_SYNTAX;
  564. state = SAW_ERROR;
  565. continue mainLoop;
  566. }
  567. ch = buffer[i];
  568. if (ch == quote)
  569. break;
  570. }
  571. String encoding = new String(buffer,dstart, i-dstart);
  572. if (in instanceof BinaryInPort)
  573. ((BinaryInPort) in).setCharset(encoding);
  574. dstart = i+1;
  575. while (dstart < end
  576. && Character.isWhitespace(buffer[dstart]))
  577. dstart++;
  578. }
  579. if (end > dstart + 9
  580. && buffer[dstart] == 's'
  581. && buffer[dstart+1] == 't'
  582. && buffer[dstart+2] == 'a'
  583. && buffer[dstart+3] == 'n'
  584. && buffer[dstart+4] == 'd'
  585. && buffer[dstart+5] == 'a'
  586. && buffer[dstart+6] == 'l'
  587. && buffer[dstart+7] == 'o'
  588. && buffer[dstart+8] == 'n'
  589. && buffer[dstart+9] == 'e')
  590. {
  591. dstart += 10;
  592. ch = buffer[dstart];
  593. while (Character.isWhitespace(ch)
  594. && ++dstart < end)
  595. ch = buffer[dstart];
  596. if (ch != '=')
  597. {
  598. message = BAD_STANDALONE_SYNTAX;
  599. state = SAW_ERROR;
  600. continue mainLoop;
  601. }
  602. ch = buffer[++dstart];
  603. while (Character.isWhitespace(ch)
  604. && ++dstart < end)
  605. ch = buffer[dstart];
  606. if (ch != '\'' && ch != '\"')
  607. {
  608. message = BAD_STANDALONE_SYNTAX;
  609. state = SAW_ERROR;
  610. continue mainLoop;
  611. }
  612. quote = ch;
  613. i = ++dstart;
  614. for (;; i++)
  615. {
  616. if (i == end)
  617. {
  618. message = BAD_STANDALONE_SYNTAX;
  619. state = SAW_ERROR;
  620. continue mainLoop;
  621. }
  622. ch = buffer[i];
  623. if (ch == quote)
  624. break;
  625. }
  626. if (i == dstart+3
  627. && buffer[dstart] == 'y'
  628. && buffer[dstart+1] == 'e'
  629. && buffer[dstart+2] == 's')
  630. {
  631. }
  632. else if (i == dstart+2
  633. && buffer[dstart] == 'n'
  634. && buffer[dstart+1] == 'o')
  635. {
  636. }
  637. else
  638. {
  639. message = BAD_STANDALONE_SYNTAX;
  640. state = SAW_ERROR;
  641. continue mainLoop;
  642. }
  643. dstart = i+1;
  644. while (dstart < end
  645. && Character.isWhitespace(buffer[dstart]))
  646. dstart++;
  647. }
  648. if (end != dstart)
  649. {
  650. message = "junk at end of xml declaration";
  651. pos = dstart;
  652. state = SAW_ERROR;
  653. continue mainLoop;
  654. }
  655. }
  656. else
  657. {
  658. message = "<?xml must be at start of file";
  659. state = SAW_ERROR;
  660. continue mainLoop;
  661. }
  662. }
  663. else if (strict && state == INIT_LEFT_QUEST_STATE)
  664. {
  665. state = MISSING_XML_DECL;
  666. continue mainLoop;
  667. }
  668. else
  669. out.processingInstructionFromParser(buffer, start, length,
  670. dstart, end - dstart);
  671. start = -1;
  672. dstart = -1;
  673. state = TEXT_STATE;
  674. break handleChar;
  675. }
  676. if (pos < limit)
  677. ch = buffer[pos++];
  678. else
  679. break handleChar;
  680. }
  681. case SAW_LEFT_EXCL_STATE: // Seen '<!'
  682. exclLoop:
  683. for (;;)
  684. {
  685. if (ch == '>')
  686. {
  687. length = pos - 1 - start;
  688. if (length >= 4
  689. && buffer[start] == '-'
  690. && buffer[start+1] == '-')
  691. {
  692. if (buffer[pos-2] == '-'
  693. && buffer[pos-3] == '-')
  694. {
  695. in.pos = pos;
  696. out.commentFromParser(buffer, start + 2, length - 4);
  697. start = -1;
  698. break exclLoop;
  699. }
  700. }
  701. else if (length >= 6
  702. && buffer[start] == '['
  703. && buffer[start+1] == 'C'
  704. && buffer[start+2] == 'D'
  705. && buffer[start+3] == 'A'
  706. && buffer[start+4] == 'T'
  707. && buffer[start+5] == 'A'
  708. && buffer[start+6] == '[')
  709. {
  710. if (buffer[pos-2] == ']'
  711. && buffer[pos-3] == ']')
  712. {
  713. in.pos = pos;
  714. out.writeCDATA(buffer, start + 7, pos - 10 - start);
  715. start = -1;
  716. break exclLoop;
  717. }
  718. }
  719. else
  720. {
  721. // FIXME ignoreing <!ELEMENT ... > etc.
  722. break exclLoop;
  723. }
  724. }
  725. else if (pos == start+7
  726. && buffer[start] == 'D'
  727. && buffer[start+1] == 'O'
  728. && buffer[start+2] == 'C'
  729. && buffer[start+3] == 'T'
  730. && buffer[start+4] == 'Y'
  731. && buffer[start+5] == 'P'
  732. && ch == 'E')
  733. {
  734. start = -1;
  735. state = SKIP_SPACES_MODIFIER + DOCTYPE_SEEN_STATE;
  736. break handleChar;
  737. }
  738. if (pos < limit)
  739. ch = buffer[pos++];
  740. else
  741. break handleChar;
  742. }
  743. start = -1;
  744. state = TEXT_STATE;
  745. break handleChar;
  746. case DOCTYPE_SEEN_STATE: /* Seen '<!DOCTYPE' S* */
  747. state = EXPECT_NAME_MODIFIER + DOCTYPE_NAME_SEEN_STATE;
  748. start = pos - 1;
  749. continue mainLoop;
  750. case DOCTYPE_NAME_SEEN_STATE: /* Seen '<!DOCTYPE' S* Name */
  751. if (dstart < 0)
  752. {
  753. // First type - i.e. not after a handelChar call.
  754. dstart = pos - 1;
  755. dstart -= start; // Make relative.
  756. dstart <<= 1; // Add bit for whether in a '['.
  757. terminator = 0;
  758. }
  759. for (;;)
  760. {
  761. if (ch == '\'' || ch == '\"')
  762. {
  763. if (terminator == 0)
  764. terminator = ch;
  765. else if (terminator == ch)
  766. terminator = 0;
  767. }
  768. else if (terminator == 0) // I.e. not inside a string.
  769. {
  770. // Low-order bit of dstart is 1 if we've seen a '['.
  771. if (ch == '[')
  772. dstart |= 1;
  773. else if (ch == ']')
  774. dstart &= ~1;
  775. else if (ch == '>' && (dstart & 1) == 0)
  776. {
  777. in.pos = pos;
  778. dstart >>= 1;
  779. dstart += start;
  780. out.emitDoctypeDecl(buffer, start, length,
  781. dstart, pos - 1 - dstart);
  782. terminator = (char) '<';
  783. start = -1;
  784. dstart = -1;
  785. state = TEXT_STATE;
  786. break handleChar;
  787. }
  788. }
  789. if (pos < limit)
  790. ch = buffer[pos++];
  791. else
  792. break handleChar;
  793. }
  794. case MAYBE_ATTRIBUTE_STATE:
  795. terminator = '<';
  796. continue_state = SAW_LEFT_STATE;
  797. if (ch == '/')
  798. {
  799. in.pos = pos;
  800. out.emitEndAttributes();
  801. out.emitEndElement(null, 0, 0);
  802. state = EXPECT_RIGHT_STATE;
  803. break handleChar;
  804. }
  805. if (ch == '>')
  806. {
  807. in.pos = pos;
  808. out.emitEndAttributes();
  809. state = TEXT_STATE;
  810. break handleChar;
  811. }
  812. start = pos - 1;
  813. state = EXPECT_NAME_MODIFIER + ATTRIBUTE_SEEN_NAME_STATE;
  814. continue mainLoop;
  815. case ATTRIBUTE_SEEN_NAME_STATE:
  816. if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n'
  817. || ch == '\u0085' || ch == '\u2028')
  818. break handleChar;
  819. in.pos = pos-length; // position of start of name, for errors.
  820. out.emitStartAttribute(buffer, start, length);
  821. start = -1;
  822. if (ch == '=')
  823. {
  824. state = ATTRIBUTE_SEEN_EQ_STATE;
  825. break handleChar;
  826. }
  827. out.emitEndAttributes();
  828. message = "missing or misplaced '=' after attribute name";
  829. state = SAW_ERROR;
  830. continue mainLoop;
  831. case ATTRIBUTE_SEEN_EQ_STATE:
  832. if (ch == '\'' || ch == '\"')
  833. {
  834. terminator = ch;
  835. continue_state = SKIP_SPACES_MODIFIER + MAYBE_ATTRIBUTE_STATE;
  836. state = TEXT_STATE;
  837. break handleChar;
  838. }
  839. if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n'
  840. || ch == '\u0085' || ch == '\u2028')
  841. break handleChar;
  842. out.emitEndAttributes();
  843. message = "missing or unquoted attribute value";
  844. state = SAW_ERROR;
  845. continue mainLoop;
  846. case SAW_LEFT_SLASH_STATE: // Seen '</'.
  847. // Do "Name" subroutine, then goto END_ELEMENT_STATE.
  848. start = pos - 1;
  849. state = EXPECT_NAME_MODIFIER + END_ELEMENT_STATE;
  850. continue mainLoop;
  851. case END_ELEMENT_STATE: // Seen '</' Name.
  852. in.pos = pos;
  853. out.emitEndElement(buffer, start, length);
  854. start = -1;
  855. // Skip spaces then goto EXPECT_RIGHT_STATE.
  856. state = SKIP_SPACES_MODIFIER + EXPECT_RIGHT_STATE;
  857. continue mainLoop;
  858. case EXPECT_RIGHT_STATE: // Looking for '>'.
  859. if (ch != '>')
  860. {
  861. message = "missing '>'";
  862. state = SAW_ERROR;
  863. continue mainLoop;
  864. }
  865. state = TEXT_STATE;
  866. break handleChar;
  867. }
  868. // After 'break handleChar', we get here.
  869. if (pos >= limit)
  870. {
  871. int saved = pos - start;
  872. try
  873. {
  874. if (start >= 0)
  875. {
  876. in.setSaveStart(start);
  877. }
  878. in.pos = pos;
  879. int x = in.peek();
  880. if (x < 0)
  881. {
  882. if (state == TEXT_STATE || state == PREV_WAS_CR_STATE)
  883. return;
  884. state = SAW_EOF_ERROR;
  885. continue;
  886. }
  887. if (start >= 0)
  888. {
  889. in.setSaveStart(-1);
  890. }
  891. }
  892. catch (java.io.IOException ex)
  893. {
  894. throw new RuntimeException(ex.getMessage());
  895. }
  896. pos = in.pos;
  897. buffer = in.buffer;
  898. limit = in.limit;
  899. start = start >= 0 ? pos - saved : limit;
  900. }
  901. ch = buffer[pos++];
  902. }
  903. }
  904. }