# scanner.py
# Scanner produces tokens of the following types:
# STREAM-START
# STREAM-END
# DIRECTIVE(name, value)
# DOCUMENT-START
# DOCUMENT-END
# BLOCK-SEQUENCE-START
# BLOCK-MAPPING-START
# BLOCK-END
# FLOW-SEQUENCE-START
# FLOW-MAPPING-START
# FLOW-SEQUENCE-END
# FLOW-MAPPING-END
# BLOCK-ENTRY
# FLOW-ENTRY
# KEY
# VALUE
# ALIAS(value)
# ANCHOR(value)
# TAG(value)
# SCALAR(value, plain, style)
#
# Read comments in the Scanner code for more details.
#
  25. __all__ = ['Scanner', 'ScannerError']
  26. from .error import MarkedYAMLError
  27. from .tokens import *
  28. class ScannerError(MarkedYAMLError):
  29. pass
  30. class SimpleKey:
  31. # See below simple keys treatment.
  32. def __init__(self, token_number, required, index, line, column, mark):
  33. self.token_number = token_number
  34. self.required = required
  35. self.index = index
  36. self.line = line
  37. self.column = column
  38. self.mark = mark
  39. class Scanner:
  40. def __init__(self):
  41. """Initialize the scanner."""
  42. # It is assumed that Scanner and Reader will have a common descendant.
  43. # Reader do the dirty work of checking for BOM and converting the
  44. # input data to Unicode. It also adds NUL to the end.
  45. #
  46. # Reader supports the following methods
  47. # self.peek(i=0) # peek the next i-th character
  48. # self.prefix(l=1) # peek the next l characters
  49. # self.forward(l=1) # read the next l characters and move the pointer.
  50. # Had we reached the end of the stream?
  51. self.done = False
  52. # The number of unclosed '{' and '['. `flow_level == 0` means block
  53. # context.
  54. self.flow_level = 0
  55. # List of processed tokens that are not yet emitted.
  56. self.tokens = []
  57. # Add the STREAM-START token.
  58. self.fetch_stream_start()
  59. # Number of tokens that were emitted through the `get_token` method.
  60. self.tokens_taken = 0
  61. # The current indentation level.
  62. self.indent = -1
  63. # Past indentation levels.
  64. self.indents = []
  65. # Variables related to simple keys treatment.
  66. # A simple key is a key that is not denoted by the '?' indicator.
  67. # Example of simple keys:
  68. # ---
  69. # block simple key: value
  70. # ? not a simple key:
  71. # : { flow simple key: value }
  72. # We emit the KEY token before all keys, so when we find a potential
  73. # simple key, we try to locate the corresponding ':' indicator.
  74. # Simple keys should be limited to a single line and 1024 characters.
  75. # Can a simple key start at the current position? A simple key may
  76. # start:
  77. # - at the beginning of the line, not counting indentation spaces
  78. # (in block context),
  79. # - after '{', '[', ',' (in the flow context),
  80. # - after '?', ':', '-' (in the block context).
  81. # In the block context, this flag also signifies if a block collection
  82. # may start at the current position.
  83. self.allow_simple_key = True
  84. # Keep track of possible simple keys. This is a dictionary. The key
  85. # is `flow_level`; there can be no more that one possible simple key
  86. # for each level. The value is a SimpleKey record:
  87. # (token_number, required, index, line, column, mark)
  88. # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
  89. # '[', or '{' tokens.
  90. self.possible_simple_keys = {}
  91. # Public methods.
  92. def check_token(self, *choices):
  93. # Check if the next token is one of the given types.
  94. while self.need_more_tokens():
  95. self.fetch_more_tokens()
  96. if self.tokens:
  97. if not choices:
  98. return True
  99. for choice in choices:
  100. if isinstance(self.tokens[0], choice):
  101. return True
  102. return False
  103. def peek_token(self):
  104. # Return the next token, but do not delete if from the queue.
  105. while self.need_more_tokens():
  106. self.fetch_more_tokens()
  107. if self.tokens:
  108. return self.tokens[0]
  109. def get_token(self):
  110. # Return the next token.
  111. while self.need_more_tokens():
  112. self.fetch_more_tokens()
  113. if self.tokens:
  114. self.tokens_taken += 1
  115. return self.tokens.pop(0)
  116. # Private methods.
  117. def need_more_tokens(self):
  118. if self.done:
  119. return False
  120. if not self.tokens:
  121. return True
  122. # The current token may be a potential simple key, so we
  123. # need to look further.
  124. self.stale_possible_simple_keys()
  125. if self.next_possible_simple_key() == self.tokens_taken:
  126. return True
  127. def fetch_more_tokens(self):
  128. # Eat whitespaces and comments until we reach the next token.
  129. self.scan_to_next_token()
  130. # Remove obsolete possible simple keys.
  131. self.stale_possible_simple_keys()
  132. # Compare the current indentation and column. It may add some tokens
  133. # and decrease the current indentation level.
  134. self.unwind_indent(self.column)
  135. # Peek the next character.
  136. ch = self.peek()
  137. # Is it the end of stream?
  138. if ch == '\0':
  139. return self.fetch_stream_end()
  140. # Is it a directive?
  141. if ch == '%' and self.check_directive():
  142. return self.fetch_directive()
  143. # Is it the document start?
  144. if ch == '-' and self.check_document_start():
  145. return self.fetch_document_start()
  146. # Is it the document end?
  147. if ch == '.' and self.check_document_end():
  148. return self.fetch_document_end()
  149. # TODO: support for BOM within a stream.
  150. #if ch == '\uFEFF':
  151. # return self.fetch_bom() <-- issue BOMToken
  152. # Note: the order of the following checks is NOT significant.
  153. # Is it the flow sequence start indicator?
  154. if ch == '[':
  155. return self.fetch_flow_sequence_start()
  156. # Is it the flow mapping start indicator?
  157. if ch == '{':
  158. return self.fetch_flow_mapping_start()
  159. # Is it the flow sequence end indicator?
  160. if ch == ']':
  161. return self.fetch_flow_sequence_end()
  162. # Is it the flow mapping end indicator?
  163. if ch == '}':
  164. return self.fetch_flow_mapping_end()
  165. # Is it the flow entry indicator?
  166. if ch == ',':
  167. return self.fetch_flow_entry()
  168. # Is it the block entry indicator?
  169. if ch == '-' and self.check_block_entry():
  170. return self.fetch_block_entry()
  171. # Is it the key indicator?
  172. if ch == '?' and self.check_key():
  173. return self.fetch_key()
  174. # Is it the value indicator?
  175. if ch == ':' and self.check_value():
  176. return self.fetch_value()
  177. # Is it an alias?
  178. if ch == '*':
  179. return self.fetch_alias()
  180. # Is it an anchor?
  181. if ch == '&':
  182. return self.fetch_anchor()
  183. # Is it a tag?
  184. if ch == '!':
  185. return self.fetch_tag()
  186. # Is it a literal scalar?
  187. if ch == '|' and not self.flow_level:
  188. return self.fetch_literal()
  189. # Is it a folded scalar?
  190. if ch == '>' and not self.flow_level:
  191. return self.fetch_folded()
  192. # Is it a single quoted scalar?
  193. if ch == '\'':
  194. return self.fetch_single()
  195. # Is it a double quoted scalar?
  196. if ch == '\"':
  197. return self.fetch_double()
  198. # It must be a plain scalar then.
  199. if self.check_plain():
  200. return self.fetch_plain()
  201. # No? It's an error. Let's produce a nice error message.
  202. raise ScannerError("while scanning for the next token", None,
  203. "found character %r that cannot start any token" % ch,
  204. self.get_mark())
  205. # Simple keys treatment.
  206. def next_possible_simple_key(self):
  207. # Return the number of the nearest possible simple key. Actually we
  208. # don't need to loop through the whole dictionary. We may replace it
  209. # with the following code:
  210. # if not self.possible_simple_keys:
  211. # return None
  212. # return self.possible_simple_keys[
  213. # min(self.possible_simple_keys.keys())].token_number
  214. min_token_number = None
  215. for level in self.possible_simple_keys:
  216. key = self.possible_simple_keys[level]
  217. if min_token_number is None or key.token_number < min_token_number:
  218. min_token_number = key.token_number
  219. return min_token_number
  220. def stale_possible_simple_keys(self):
  221. # Remove entries that are no longer possible simple keys. According to
  222. # the YAML specification, simple keys
  223. # - should be limited to a single line,
  224. # - should be no longer than 1024 characters.
  225. # Disabling this procedure will allow simple keys of any length and
  226. # height (may cause problems if indentation is broken though).
  227. for level in list(self.possible_simple_keys):
  228. key = self.possible_simple_keys[level]
  229. if key.line != self.line \
  230. or self.index-key.index > 1024:
  231. if key.required:
  232. raise ScannerError("while scanning a simple key", key.mark,
  233. "could not found expected ':'", self.get_mark())
  234. del self.possible_simple_keys[level]
  235. def save_possible_simple_key(self):
  236. # The next token may start a simple key. We check if it's possible
  237. # and save its position. This function is called for
  238. # ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
  239. # Check if a simple key is required at the current position.
  240. required = not self.flow_level and self.indent == self.column
  241. # A simple key is required only if it is the first token in the current
  242. # line. Therefore it is always allowed.
  243. assert self.allow_simple_key or not required
  244. # The next token might be a simple key. Let's save it's number and
  245. # position.
  246. if self.allow_simple_key:
  247. self.remove_possible_simple_key()
  248. token_number = self.tokens_taken+len(self.tokens)
  249. key = SimpleKey(token_number, required,
  250. self.index, self.line, self.column, self.get_mark())
  251. self.possible_simple_keys[self.flow_level] = key
  252. def remove_possible_simple_key(self):
  253. # Remove the saved possible key position at the current flow level.
  254. if self.flow_level in self.possible_simple_keys:
  255. key = self.possible_simple_keys[self.flow_level]
  256. if key.required:
  257. raise ScannerError("while scanning a simple key", key.mark,
  258. "could not found expected ':'", self.get_mark())
  259. del self.possible_simple_keys[self.flow_level]
  260. # Indentation functions.
  261. def unwind_indent(self, column):
  262. ## In flow context, tokens should respect indentation.
  263. ## Actually the condition should be `self.indent >= column` according to
  264. ## the spec. But this condition will prohibit intuitively correct
  265. ## constructions such as
  266. ## key : {
  267. ## }
  268. #if self.flow_level and self.indent > column:
  269. # raise ScannerError(None, None,
  270. # "invalid intendation or unclosed '[' or '{'",
  271. # self.get_mark())
  272. # In the flow context, indentation is ignored. We make the scanner less
  273. # restrictive then specification requires.
  274. if self.flow_level:
  275. return
  276. # In block context, we may need to issue the BLOCK-END tokens.
  277. while self.indent > column:
  278. mark = self.get_mark()
  279. self.indent = self.indents.pop()
  280. self.tokens.append(BlockEndToken(mark, mark))
  281. def add_indent(self, column):
  282. # Check if we need to increase indentation.
  283. if self.indent < column:
  284. self.indents.append(self.indent)
  285. self.indent = column
  286. return True
  287. return False
  288. # Fetchers.
  289. def fetch_stream_start(self):
  290. # We always add STREAM-START as the first token and STREAM-END as the
  291. # last token.
  292. # Read the token.
  293. mark = self.get_mark()
  294. # Add STREAM-START.
  295. self.tokens.append(StreamStartToken(mark, mark,
  296. encoding=self.encoding))
  297. def fetch_stream_end(self):
  298. # Set the current intendation to -1.
  299. self.unwind_indent(-1)
  300. # Reset simple keys.
  301. self.remove_possible_simple_key()
  302. self.allow_simple_key = False
  303. self.possible_simple_keys = {}
  304. # Read the token.
  305. mark = self.get_mark()
  306. # Add STREAM-END.
  307. self.tokens.append(StreamEndToken(mark, mark))
  308. # The steam is finished.
  309. self.done = True
  310. def fetch_directive(self):
  311. # Set the current intendation to -1.
  312. self.unwind_indent(-1)
  313. # Reset simple keys.
  314. self.remove_possible_simple_key()
  315. self.allow_simple_key = False
  316. # Scan and add DIRECTIVE.
  317. self.tokens.append(self.scan_directive())
  318. def fetch_document_start(self):
  319. self.fetch_document_indicator(DocumentStartToken)
  320. def fetch_document_end(self):
  321. self.fetch_document_indicator(DocumentEndToken)
  322. def fetch_document_indicator(self, TokenClass):
  323. # Set the current intendation to -1.
  324. self.unwind_indent(-1)
  325. # Reset simple keys. Note that there could not be a block collection
  326. # after '---'.
  327. self.remove_possible_simple_key()
  328. self.allow_simple_key = False
  329. # Add DOCUMENT-START or DOCUMENT-END.
  330. start_mark = self.get_mark()
  331. self.forward(3)
  332. end_mark = self.get_mark()
  333. self.tokens.append(TokenClass(start_mark, end_mark))
  334. def fetch_flow_sequence_start(self):
  335. self.fetch_flow_collection_start(FlowSequenceStartToken)
  336. def fetch_flow_mapping_start(self):
  337. self.fetch_flow_collection_start(FlowMappingStartToken)
  338. def fetch_flow_collection_start(self, TokenClass):
  339. # '[' and '{' may start a simple key.
  340. self.save_possible_simple_key()
  341. # Increase the flow level.
  342. self.flow_level += 1
  343. # Simple keys are allowed after '[' and '{'.
  344. self.allow_simple_key = True
  345. # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
  346. start_mark = self.get_mark()
  347. self.forward()
  348. end_mark = self.get_mark()
  349. self.tokens.append(TokenClass(start_mark, end_mark))
  350. def fetch_flow_sequence_end(self):
  351. self.fetch_flow_collection_end(FlowSequenceEndToken)
  352. def fetch_flow_mapping_end(self):
  353. self.fetch_flow_collection_end(FlowMappingEndToken)
  354. def fetch_flow_collection_end(self, TokenClass):
  355. # Reset possible simple key on the current level.
  356. self.remove_possible_simple_key()
  357. # Decrease the flow level.
  358. self.flow_level -= 1
  359. # No simple keys after ']' or '}'.
  360. self.allow_simple_key = False
  361. # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
  362. start_mark = self.get_mark()
  363. self.forward()
  364. end_mark = self.get_mark()
  365. self.tokens.append(TokenClass(start_mark, end_mark))
  366. def fetch_flow_entry(self):
  367. # Simple keys are allowed after ','.
  368. self.allow_simple_key = True
  369. # Reset possible simple key on the current level.
  370. self.remove_possible_simple_key()
  371. # Add FLOW-ENTRY.
  372. start_mark = self.get_mark()
  373. self.forward()
  374. end_mark = self.get_mark()
  375. self.tokens.append(FlowEntryToken(start_mark, end_mark))
  376. def fetch_block_entry(self):
  377. # Block context needs additional checks.
  378. if not self.flow_level:
  379. # Are we allowed to start a new entry?
  380. if not self.allow_simple_key:
  381. raise ScannerError(None, None,
  382. "sequence entries are not allowed here",
  383. self.get_mark())
  384. # We may need to add BLOCK-SEQUENCE-START.
  385. if self.add_indent(self.column):
  386. mark = self.get_mark()
  387. self.tokens.append(BlockSequenceStartToken(mark, mark))
  388. # It's an error for the block entry to occur in the flow context,
  389. # but we let the parser detect this.
  390. else:
  391. pass
  392. # Simple keys are allowed after '-'.
  393. self.allow_simple_key = True
  394. # Reset possible simple key on the current level.
  395. self.remove_possible_simple_key()
  396. # Add BLOCK-ENTRY.
  397. start_mark = self.get_mark()
  398. self.forward()
  399. end_mark = self.get_mark()
  400. self.tokens.append(BlockEntryToken(start_mark, end_mark))
  401. def fetch_key(self):
  402. # Block context needs additional checks.
  403. if not self.flow_level:
  404. # Are we allowed to start a key (not nessesary a simple)?
  405. if not self.allow_simple_key:
  406. raise ScannerError(None, None,
  407. "mapping keys are not allowed here",
  408. self.get_mark())
  409. # We may need to add BLOCK-MAPPING-START.
  410. if self.add_indent(self.column):
  411. mark = self.get_mark()
  412. self.tokens.append(BlockMappingStartToken(mark, mark))
  413. # Simple keys are allowed after '?' in the block context.
  414. self.allow_simple_key = not self.flow_level
  415. # Reset possible simple key on the current level.
  416. self.remove_possible_simple_key()
  417. # Add KEY.
  418. start_mark = self.get_mark()
  419. self.forward()
  420. end_mark = self.get_mark()
  421. self.tokens.append(KeyToken(start_mark, end_mark))
  422. def fetch_value(self):
  423. # Do we determine a simple key?
  424. if self.flow_level in self.possible_simple_keys:
  425. # Add KEY.
  426. key = self.possible_simple_keys[self.flow_level]
  427. del self.possible_simple_keys[self.flow_level]
  428. self.tokens.insert(key.token_number-self.tokens_taken,
  429. KeyToken(key.mark, key.mark))
  430. # If this key starts a new block mapping, we need to add
  431. # BLOCK-MAPPING-START.
  432. if not self.flow_level:
  433. if self.add_indent(key.column):
  434. self.tokens.insert(key.token_number-self.tokens_taken,
  435. BlockMappingStartToken(key.mark, key.mark))
  436. # There cannot be two simple keys one after another.
  437. self.allow_simple_key = False
  438. # It must be a part of a complex key.
  439. else:
  440. # Block context needs additional checks.
  441. # (Do we really need them? They will be catched by the parser
  442. # anyway.)
  443. if not self.flow_level:
  444. # We are allowed to start a complex value if and only if
  445. # we can start a simple key.
  446. if not self.allow_simple_key:
  447. raise ScannerError(None, None,
  448. "mapping values are not allowed here",
  449. self.get_mark())
  450. # If this value starts a new block mapping, we need to add
  451. # BLOCK-MAPPING-START. It will be detected as an error later by
  452. # the parser.
  453. if not self.flow_level:
  454. if self.add_indent(self.column):
  455. mark = self.get_mark()
  456. self.tokens.append(BlockMappingStartToken(mark, mark))
  457. # Simple keys are allowed after ':' in the block context.
  458. self.allow_simple_key = not self.flow_level
  459. # Reset possible simple key on the current level.
  460. self.remove_possible_simple_key()
  461. # Add VALUE.
  462. start_mark = self.get_mark()
  463. self.forward()
  464. end_mark = self.get_mark()
  465. self.tokens.append(ValueToken(start_mark, end_mark))
  466. def fetch_alias(self):
  467. # ALIAS could be a simple key.
  468. self.save_possible_simple_key()
  469. # No simple keys after ALIAS.
  470. self.allow_simple_key = False
  471. # Scan and add ALIAS.
  472. self.tokens.append(self.scan_anchor(AliasToken))
  473. def fetch_anchor(self):
  474. # ANCHOR could start a simple key.
  475. self.save_possible_simple_key()
  476. # No simple keys after ANCHOR.
  477. self.allow_simple_key = False
  478. # Scan and add ANCHOR.
  479. self.tokens.append(self.scan_anchor(AnchorToken))
  480. def fetch_tag(self):
  481. # TAG could start a simple key.
  482. self.save_possible_simple_key()
  483. # No simple keys after TAG.
  484. self.allow_simple_key = False
  485. # Scan and add TAG.
  486. self.tokens.append(self.scan_tag())
  487. def fetch_literal(self):
  488. self.fetch_block_scalar(style='|')
  489. def fetch_folded(self):
  490. self.fetch_block_scalar(style='>')
  491. def fetch_block_scalar(self, style):
  492. # A simple key may follow a block scalar.
  493. self.allow_simple_key = True
  494. # Reset possible simple key on the current level.
  495. self.remove_possible_simple_key()
  496. # Scan and add SCALAR.
  497. self.tokens.append(self.scan_block_scalar(style))
  498. def fetch_single(self):
  499. self.fetch_flow_scalar(style='\'')
  500. def fetch_double(self):
  501. self.fetch_flow_scalar(style='"')
  502. def fetch_flow_scalar(self, style):
  503. # A flow scalar could be a simple key.
  504. self.save_possible_simple_key()
  505. # No simple keys after flow scalars.
  506. self.allow_simple_key = False
  507. # Scan and add SCALAR.
  508. self.tokens.append(self.scan_flow_scalar(style))
  509. def fetch_plain(self):
  510. # A plain scalar could be a simple key.
  511. self.save_possible_simple_key()
  512. # No simple keys after plain scalars. But note that `scan_plain` will
  513. # change this flag if the scan is finished at the beginning of the
  514. # line.
  515. self.allow_simple_key = False
  516. # Scan and add SCALAR. May change `allow_simple_key`.
  517. self.tokens.append(self.scan_plain())
  518. # Checkers.
  519. def check_directive(self):
  520. # DIRECTIVE: ^ '%' ...
  521. # The '%' indicator is already checked.
  522. if self.column == 0:
  523. return True
  524. def check_document_start(self):
  525. # DOCUMENT-START: ^ '---' (' '|'\n')
  526. if self.column == 0:
  527. if self.prefix(3) == '---' \
  528. and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
  529. return True
  530. def check_document_end(self):
  531. # DOCUMENT-END: ^ '...' (' '|'\n')
  532. if self.column == 0:
  533. if self.prefix(3) == '...' \
  534. and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
  535. return True
  536. def check_block_entry(self):
  537. # BLOCK-ENTRY: '-' (' '|'\n')
  538. return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'
  539. def check_key(self):
  540. # KEY(flow context): '?'
  541. if self.flow_level:
  542. return True
  543. # KEY(block context): '?' (' '|'\n')
  544. else:
  545. return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'
  546. def check_value(self):
  547. # VALUE(flow context): ':'
  548. if self.flow_level:
  549. return True
  550. # VALUE(block context): ':' (' '|'\n')
  551. else:
  552. return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'
  553. def check_plain(self):
  554. # A plain scalar may start with any non-space character except:
  555. # '-', '?', ':', ',', '[', ']', '{', '}',
  556. # '#', '&', '*', '!', '|', '>', '\'', '\"',
  557. # '%', '@', '`'.
  558. #
  559. # It may also start with
  560. # '-', '?', ':'
  561. # if it is followed by a non-space character.
  562. #
  563. # Note that we limit the last rule to the block context (except the
  564. # '-' character) because we want the flow context to be space
  565. # independent.
  566. ch = self.peek()
  567. return ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`' \
  568. or (self.peek(1) not in '\0 \t\r\n\x85\u2028\u2029'
  569. and (ch == '-' or (not self.flow_level and ch in '?:')))
  570. # Scanners.
  571. def scan_to_next_token(self):
  572. # We ignore spaces, line breaks and comments.
  573. # If we find a line break in the block context, we set the flag
  574. # `allow_simple_key` on.
  575. # The byte order mark is stripped if it's the first character in the
  576. # stream. We do not yet support BOM inside the stream as the
  577. # specification requires. Any such mark will be considered as a part
  578. # of the document.
  579. #
  580. # TODO: We need to make tab handling rules more sane. A good rule is
  581. # Tabs cannot precede tokens
  582. # BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
  583. # KEY(block), VALUE(block), BLOCK-ENTRY
  584. # So the checking code is
  585. # if <TAB>:
  586. # self.allow_simple_keys = False
  587. # We also need to add the check for `allow_simple_keys == True` to
  588. # `unwind_indent` before issuing BLOCK-END.
  589. # Scanners for block, flow, and plain scalars need to be modified.
  590. if self.index == 0 and self.peek() == '\uFEFF':
  591. self.forward()
  592. found = False
  593. while not found:
  594. while self.peek() == ' ':
  595. self.forward()
  596. if self.peek() == '#':
  597. while self.peek() not in '\0\r\n\x85\u2028\u2029':
  598. self.forward()
  599. if self.scan_line_break():
  600. if not self.flow_level:
  601. self.allow_simple_key = True
  602. else:
  603. found = True
  604. def scan_directive(self):
  605. # See the specification for details.
  606. start_mark = self.get_mark()
  607. self.forward()
  608. name = self.scan_directive_name(start_mark)
  609. value = None
  610. if name == 'YAML':
  611. value = self.scan_yaml_directive_value(start_mark)
  612. end_mark = self.get_mark()
  613. elif name == 'TAG':
  614. value = self.scan_tag_directive_value(start_mark)
  615. end_mark = self.get_mark()
  616. else:
  617. end_mark = self.get_mark()
  618. while self.peek() not in '\0\r\n\x85\u2028\u2029':
  619. self.forward()
  620. self.scan_directive_ignored_line(start_mark)
  621. return DirectiveToken(name, value, start_mark, end_mark)
  622. def scan_directive_name(self, start_mark):
  623. # See the specification for details.
  624. length = 0
  625. ch = self.peek(length)
  626. while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
  627. or ch in '-_':
  628. length += 1
  629. ch = self.peek(length)
  630. if not length:
  631. raise ScannerError("while scanning a directive", start_mark,
  632. "expected alphabetic or numeric character, but found %r"
  633. % ch, self.get_mark())
  634. value = self.prefix(length)
  635. self.forward(length)
  636. ch = self.peek()
  637. if ch not in '\0 \r\n\x85\u2028\u2029':
  638. raise ScannerError("while scanning a directive", start_mark,
  639. "expected alphabetic or numeric character, but found %r"
  640. % ch, self.get_mark())
  641. return value
  642. def scan_yaml_directive_value(self, start_mark):
  643. # See the specification for details.
  644. while self.peek() == ' ':
  645. self.forward()
  646. major = self.scan_yaml_directive_number(start_mark)
  647. if self.peek() != '.':
  648. raise ScannerError("while scanning a directive", start_mark,
  649. "expected a digit or '.', but found %r" % self.peek(),
  650. self.get_mark())
  651. self.forward()
  652. minor = self.scan_yaml_directive_number(start_mark)
  653. if self.peek() not in '\0 \r\n\x85\u2028\u2029':
  654. raise ScannerError("while scanning a directive", start_mark,
  655. "expected a digit or ' ', but found %r" % self.peek(),
  656. self.get_mark())
  657. return (major, minor)
  658. def scan_yaml_directive_number(self, start_mark):
  659. # See the specification for details.
  660. ch = self.peek()
  661. if not ('0' <= ch <= '9'):
  662. raise ScannerError("while scanning a directive", start_mark,
  663. "expected a digit, but found %r" % ch, self.get_mark())
  664. length = 0
  665. while '0' <= self.peek(length) <= '9':
  666. length += 1
  667. value = int(self.prefix(length))
  668. self.forward(length)
  669. return value
  670. def scan_tag_directive_value(self, start_mark):
  671. # See the specification for details.
  672. while self.peek() == ' ':
  673. self.forward()
  674. handle = self.scan_tag_directive_handle(start_mark)
  675. while self.peek() == ' ':
  676. self.forward()
  677. prefix = self.scan_tag_directive_prefix(start_mark)
  678. return (handle, prefix)
  679. def scan_tag_directive_handle(self, start_mark):
  680. # See the specification for details.
  681. value = self.scan_tag_handle('directive', start_mark)
  682. ch = self.peek()
  683. if ch != ' ':
  684. raise ScannerError("while scanning a directive", start_mark,
  685. "expected ' ', but found %r" % ch, self.get_mark())
  686. return value
  687. def scan_tag_directive_prefix(self, start_mark):
  688. # See the specification for details.
  689. value = self.scan_tag_uri('directive', start_mark)
  690. ch = self.peek()
  691. if ch not in '\0 \r\n\x85\u2028\u2029':
  692. raise ScannerError("while scanning a directive", start_mark,
  693. "expected ' ', but found %r" % ch, self.get_mark())
  694. return value
  695. def scan_directive_ignored_line(self, start_mark):
  696. # See the specification for details.
  697. while self.peek() == ' ':
  698. self.forward()
  699. if self.peek() == '#':
  700. while self.peek() not in '\0\r\n\x85\u2028\u2029':
  701. self.forward()
  702. ch = self.peek()
  703. if ch not in '\0\r\n\x85\u2028\u2029':
  704. raise ScannerError("while scanning a directive", start_mark,
  705. "expected a comment or a line break, but found %r"
  706. % ch, self.get_mark())
  707. self.scan_line_break()
  708. def scan_anchor(self, TokenClass):
  709. # The specification does not restrict characters for anchors and
  710. # aliases. This may lead to problems, for instance, the document:
  711. # [ *alias, value ]
  712. # can be interpteted in two ways, as
  713. # [ "value" ]
  714. # and
  715. # [ *alias , "value" ]
  716. # Therefore we restrict aliases to numbers and ASCII letters.
  717. start_mark = self.get_mark()
  718. indicator = self.peek()
  719. if indicator == '*':
  720. name = 'alias'
  721. else:
  722. name = 'anchor'
  723. self.forward()
  724. length = 0
  725. ch = self.peek(length)
  726. while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
  727. or ch in '-_':
  728. length += 1
  729. ch = self.peek(length)
  730. if not length:
  731. raise ScannerError("while scanning an %s" % name, start_mark,
  732. "expected alphabetic or numeric character, but found %r"
  733. % ch, self.get_mark())
  734. value = self.prefix(length)
  735. self.forward(length)
  736. ch = self.peek()
  737. if ch not in '\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
  738. raise ScannerError("while scanning an %s" % name, start_mark,
  739. "expected alphabetic or numeric character, but found %r"
  740. % ch, self.get_mark())
  741. end_mark = self.get_mark()
  742. return TokenClass(value, start_mark, end_mark)
  743. def scan_tag(self):
  744. # See the specification for details.
  745. start_mark = self.get_mark()
  746. ch = self.peek(1)
  747. if ch == '<':
  748. handle = None
  749. self.forward(2)
  750. suffix = self.scan_tag_uri('tag', start_mark)
  751. if self.peek() != '>':
  752. raise ScannerError("while parsing a tag", start_mark,
  753. "expected '>', but found %r" % self.peek(),
  754. self.get_mark())
  755. self.forward()
  756. elif ch in '\0 \t\r\n\x85\u2028\u2029':
  757. handle = None
  758. suffix = '!'
  759. self.forward()
  760. else:
  761. length = 1
  762. use_handle = False
  763. while ch not in '\0 \r\n\x85\u2028\u2029':
  764. if ch == '!':
  765. use_handle = True
  766. break
  767. length += 1
  768. ch = self.peek(length)
  769. handle = '!'
  770. if use_handle:
  771. handle = self.scan_tag_handle('tag', start_mark)
  772. else:
  773. handle = '!'
  774. self.forward()
  775. suffix = self.scan_tag_uri('tag', start_mark)
  776. ch = self.peek()
  777. if ch not in '\0 \r\n\x85\u2028\u2029':
  778. raise ScannerError("while scanning a tag", start_mark,
  779. "expected ' ', but found %r" % ch, self.get_mark())
  780. value = (handle, suffix)
  781. end_mark = self.get_mark()
  782. return TagToken(value, start_mark, end_mark)
    def scan_block_scalar(self, style):
        # Scan a literal ('|') or folded ('>') block scalar token.
        # See the specification for details.
        if style == '>':
            folded = True
        else:
            folded = False
        chunks = []
        start_mark = self.get_mark()
        # Scan the header: indicator, optional chomping/indentation
        # indicators, then the rest of the line.
        self.forward()
        chomping, increment = self.scan_block_scalar_indicators(start_mark)
        self.scan_block_scalar_ignored_line(start_mark)
        # Determine the indentation level and go to the first non-empty line.
        min_indent = self.indent+1
        if min_indent < 1:
            min_indent = 1
        if increment is None:
            # No explicit indentation indicator: auto-detect from content.
            breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
            indent = max(min_indent, max_indent)
        else:
            indent = min_indent+increment-1
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
        line_break = ''
        # Scan the inner part of the block scalar, one content line at a
        # time; `breaks` carries the blank lines seen after the previous one.
        while self.column == indent and self.peek() != '\0':
            chunks.extend(breaks)
            leading_non_space = self.peek() not in ' \t'
            length = 0
            while self.peek(length) not in '\0\r\n\x85\u2028\u2029':
                length += 1
            chunks.append(self.prefix(length))
            self.forward(length)
            line_break = self.scan_line_break()
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
            if self.column == indent and self.peek() != '\0':
                # Unfortunately, folding rules are ambiguous.
                #
                # This is the folding according to the specification:
                # a '\n' between two more-indented-free lines folds to a
                # space in folded style, otherwise breaks are kept.
                if folded and line_break == '\n' \
                        and leading_non_space and self.peek() not in ' \t':
                    if not breaks:
                        chunks.append(' ')
                else:
                    chunks.append(line_break)
                # This is Clark Evans's interpretation (also in the spec
                # examples):
                #
                #if folded and line_break == '\n':
                #    if not breaks:
                #        if self.peek() not in ' \t':
                #            chunks.append(' ')
                #        else:
                #            chunks.append(line_break)
                #else:
                #    chunks.append(line_break)
            else:
                break
        # Chomp the tail: clip (None) keeps the final break, keep (True)
        # also keeps trailing blank lines, strip (False) drops both.
        if chomping is not False:
            chunks.append(line_break)
        if chomping is True:
            chunks.extend(breaks)
        # We are done.
        return ScalarToken(''.join(chunks), False, start_mark, end_mark,
                style)
  848. def scan_block_scalar_indicators(self, start_mark):
  849. # See the specification for details.
  850. chomping = None
  851. increment = None
  852. ch = self.peek()
  853. if ch in '+-':
  854. if ch == '+':
  855. chomping = True
  856. else:
  857. chomping = False
  858. self.forward()
  859. ch = self.peek()
  860. if ch in '0123456789':
  861. increment = int(ch)
  862. if increment == 0:
  863. raise ScannerError("while scanning a block scalar", start_mark,
  864. "expected indentation indicator in the range 1-9, but found 0",
  865. self.get_mark())
  866. self.forward()
  867. elif ch in '0123456789':
  868. increment = int(ch)
  869. if increment == 0:
  870. raise ScannerError("while scanning a block scalar", start_mark,
  871. "expected indentation indicator in the range 1-9, but found 0",
  872. self.get_mark())
  873. self.forward()
  874. ch = self.peek()
  875. if ch in '+-':
  876. if ch == '+':
  877. chomping = True
  878. else:
  879. chomping = False
  880. self.forward()
  881. ch = self.peek()
  882. if ch not in '\0 \r\n\x85\u2028\u2029':
  883. raise ScannerError("while scanning a block scalar", start_mark,
  884. "expected chomping or indentation indicators, but found %r"
  885. % ch, self.get_mark())
  886. return chomping, increment
  887. def scan_block_scalar_ignored_line(self, start_mark):
  888. # See the specification for details.
  889. while self.peek() == ' ':
  890. self.forward()
  891. if self.peek() == '#':
  892. while self.peek() not in '\0\r\n\x85\u2028\u2029':
  893. self.forward()
  894. ch = self.peek()
  895. if ch not in '\0\r\n\x85\u2028\u2029':
  896. raise ScannerError("while scanning a block scalar", start_mark,
  897. "expected a comment or a line break, but found %r" % ch,
  898. self.get_mark())
  899. self.scan_line_break()
  900. def scan_block_scalar_indentation(self):
  901. # See the specification for details.
  902. chunks = []
  903. max_indent = 0
  904. end_mark = self.get_mark()
  905. while self.peek() in ' \r\n\x85\u2028\u2029':
  906. if self.peek() != ' ':
  907. chunks.append(self.scan_line_break())
  908. end_mark = self.get_mark()
  909. else:
  910. self.forward()
  911. if self.column > max_indent:
  912. max_indent = self.column
  913. return chunks, max_indent, end_mark
  914. def scan_block_scalar_breaks(self, indent):
  915. # See the specification for details.
  916. chunks = []
  917. end_mark = self.get_mark()
  918. while self.column < indent and self.peek() == ' ':
  919. self.forward()
  920. while self.peek() in '\r\n\x85\u2028\u2029':
  921. chunks.append(self.scan_line_break())
  922. end_mark = self.get_mark()
  923. while self.column < indent and self.peek() == ' ':
  924. self.forward()
  925. return chunks, end_mark
  926. def scan_flow_scalar(self, style):
  927. # See the specification for details.
  928. # Note that we loose indentation rules for quoted scalars. Quoted
  929. # scalars don't need to adhere indentation because " and ' clearly
  930. # mark the beginning and the end of them. Therefore we are less
  931. # restrictive then the specification requires. We only need to check
  932. # that document separators are not included in scalars.
  933. if style == '"':
  934. double = True
  935. else:
  936. double = False
  937. chunks = []
  938. start_mark = self.get_mark()
  939. quote = self.peek()
  940. self.forward()
  941. chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
  942. while self.peek() != quote:
  943. chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
  944. chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
  945. self.forward()
  946. end_mark = self.get_mark()
  947. return ScalarToken(''.join(chunks), False, start_mark, end_mark,
  948. style)
    # Replacement text for single-character escapes in double-quoted
    # scalars: '\' followed by the key produces the value.  Covers the
    # YAML 1.1 set: C-style escapes, NEL (\N), NBSP (\_), and the
    # Unicode line (\L) / paragraph (\P) separators.
    ESCAPE_REPLACEMENTS = {
        '0': '\0',
        'a': '\x07',
        'b': '\x08',
        't': '\x09',
        '\t': '\x09',
        'n': '\x0A',
        'v': '\x0B',
        'f': '\x0C',
        'r': '\x0D',
        'e': '\x1B',
        ' ': '\x20',
        '\"': '\"',
        '\\': '\\',
        'N': '\x85',
        '_': '\xA0',
        'L': '\u2028',
        'P': '\u2029',
    }
    # Number of hexadecimal digits that follow each numeric escape
    # introducer in a double-quoted scalar: \xXX, \uXXXX, \UXXXXXXXX.
    ESCAPE_CODES = {
        'x': 2,
        'u': 4,
        'U': 8,
    }
    def scan_flow_scalar_non_spaces(self, double, start_mark):
        # Scan a run of non-whitespace content inside a quoted scalar,
        # decoding '' in single-quoted scalars and backslash escapes
        # (including \xXX/\uXXXX/\UXXXXXXXX) in double-quoted ones.
        # Returns the list of decoded text chunks; stops (and returns)
        # at whitespace, a line break, the closing quote, or EOF.
        # See the specification for details.
        chunks = []
        while True:
            # Take the longest run of ordinary characters in one slice.
            length = 0
            while self.peek(length) not in '\'\"\\\0 \t\r\n\x85\u2028\u2029':
                length += 1
            if length:
                chunks.append(self.prefix(length))
                self.forward(length)
            ch = self.peek()
            if not double and ch == '\'' and self.peek(1) == '\'':
                # '' inside a single-quoted scalar is an escaped quote.
                chunks.append('\'')
                self.forward(2)
            elif (double and ch == '\'') or (not double and ch in '\"\\'):
                # The other style's quote (or backslash when single-quoted)
                # is literal text here.
                chunks.append(ch)
                self.forward()
            elif double and ch == '\\':
                self.forward()
                ch = self.peek()
                if ch in self.ESCAPE_REPLACEMENTS:
                    # Single-character escape, e.g. \n, \t, \0.
                    chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                    self.forward()
                elif ch in self.ESCAPE_CODES:
                    # Numeric escape: a fixed number of hex digits follows.
                    length = self.ESCAPE_CODES[ch]
                    self.forward()
                    for k in range(length):
                        if self.peek(k) not in '0123456789ABCDEFabcdef':
                            raise ScannerError("while scanning a double-quoted scalar", start_mark,
                                    "expected escape sequence of %d hexdecimal numbers, but found %r" %
                                        (length, self.peek(k)), self.get_mark())
                    code = int(self.prefix(length), 16)
                    chunks.append(chr(code))
                    self.forward(length)
                elif ch in '\r\n\x85\u2028\u2029':
                    # Escaped line break: the break itself is dropped;
                    # subsequent breaks are collected verbatim.
                    self.scan_line_break()
                    chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
                else:
                    raise ScannerError("while scanning a double-quoted scalar", start_mark,
                            "found unknown escape character %r" % ch, self.get_mark())
            else:
                return chunks
  1015. def scan_flow_scalar_spaces(self, double, start_mark):
  1016. # See the specification for details.
  1017. chunks = []
  1018. length = 0
  1019. while self.peek(length) in ' \t':
  1020. length += 1
  1021. whitespaces = self.prefix(length)
  1022. self.forward(length)
  1023. ch = self.peek()
  1024. if ch == '\0':
  1025. raise ScannerError("while scanning a quoted scalar", start_mark,
  1026. "found unexpected end of stream", self.get_mark())
  1027. elif ch in '\r\n\x85\u2028\u2029':
  1028. line_break = self.scan_line_break()
  1029. breaks = self.scan_flow_scalar_breaks(double, start_mark)
  1030. if line_break != '\n':
  1031. chunks.append(line_break)
  1032. elif not breaks:
  1033. chunks.append(' ')
  1034. chunks.extend(breaks)
  1035. else:
  1036. chunks.append(whitespaces)
  1037. return chunks
  1038. def scan_flow_scalar_breaks(self, double, start_mark):
  1039. # See the specification for details.
  1040. chunks = []
  1041. while True:
  1042. # Instead of checking indentation, we check for document
  1043. # separators.
  1044. prefix = self.prefix(3)
  1045. if (prefix == '---' or prefix == '...') \
  1046. and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
  1047. raise ScannerError("while scanning a quoted scalar", start_mark,
  1048. "found unexpected document separator", self.get_mark())
  1049. while self.peek() in ' \t':
  1050. self.forward()
  1051. if self.peek() in '\r\n\x85\u2028\u2029':
  1052. chunks.append(self.scan_line_break())
  1053. else:
  1054. return chunks
    def scan_plain(self):
        # Scan a plain (unquoted) scalar token.
        # See the specification for details.
        # We add an additional restriction for the flow context:
        # plain scalars in the flow context cannot contain ',', ':' and '?'.
        # We also keep track of the `allow_simple_key` flag here.
        # Indentation rules are loosed for the flow context.
        chunks = []
        start_mark = self.get_mark()
        end_mark = start_mark
        indent = self.indent+1
        # We allow zero indentation for scalars, but then we need to check for
        # document separators at the beginning of the line.
        #if indent == 0:
        #    indent = 1
        spaces = []
        while True:
            length = 0
            # A comment always ends a plain scalar.
            if self.peek() == '#':
                break
            # Measure the next run of scalar characters; the terminators
            # depend on whether we are in flow or block context.
            while True:
                ch = self.peek(length)
                if ch in '\0 \t\r\n\x85\u2028\u2029' \
                        or (not self.flow_level and ch == ':' and
                            self.peek(length+1) in '\0 \t\r\n\x85\u2028\u2029') \
                        or (self.flow_level and ch in ',:?[]{}'):
                    break
                length += 1
            # It's not clear what we should do with ':' in the flow context.
            if (self.flow_level and ch == ':'
                    and self.peek(length+1) not in '\0 \t\r\n\x85\u2028\u2029,[]{}'):
                self.forward(length)
                raise ScannerError("while scanning a plain scalar", start_mark,
                    "found unexpected ':'", self.get_mark(),
                    "Please check http://pyyaml.org/wiki/YAMLColonInFlowContext for details.")
            if length == 0:
                break
            # A plain scalar can never be a simple key's continuation.
            self.allow_simple_key = False
            chunks.extend(spaces)
            chunks.append(self.prefix(length))
            self.forward(length)
            end_mark = self.get_mark()
            # spaces is None when scan_plain_spaces hit a document separator.
            spaces = self.scan_plain_spaces(indent, start_mark)
            if not spaces or self.peek() == '#' \
                    or (not self.flow_level and self.column < indent):
                break
        return ScalarToken(''.join(chunks), True, start_mark, end_mark)
    def scan_plain_spaces(self, indent, start_mark):
        # Scan the spaces and line breaks between two chunks of a plain
        # scalar.  Returns the folded whitespace chunks, or None (bare
        # `return`) when a document separator is found -- the caller
        # (scan_plain) treats a falsy result as end-of-scalar.
        # See the specification for details.
        # The specification is really confusing about tabs in plain scalars.
        # We just forbid them completely. Do not use tabs in YAML!
        chunks = []
        length = 0
        while self.peek(length) in ' ':
            length += 1
        whitespaces = self.prefix(length)
        self.forward(length)
        ch = self.peek()
        if ch in '\r\n\x85\u2028\u2029':
            line_break = self.scan_line_break()
            # A line break within a plain scalar re-enables simple keys.
            self.allow_simple_key = True
            prefix = self.prefix(3)
            if (prefix == '---' or prefix == '...') \
                    and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
                return
            breaks = []
            while self.peek() in ' \r\n\x85\u2028\u2029':
                if self.peek() == ' ':
                    self.forward()
                else:
                    breaks.append(self.scan_line_break())
                    prefix = self.prefix(3)
                    if (prefix == '---' or prefix == '...') \
                            and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
                        return
            # Folding: a lone '\n' becomes a space; other breaks are kept.
            if line_break != '\n':
                chunks.append(line_break)
            elif not breaks:
                chunks.append(' ')
            chunks.extend(breaks)
        elif whitespaces:
            chunks.append(whitespaces)
        return chunks
  1137. def scan_tag_handle(self, name, start_mark):
  1138. # See the specification for details.
  1139. # For some strange reasons, the specification does not allow '_' in
  1140. # tag handles. I have allowed it anyway.
  1141. ch = self.peek()
  1142. if ch != '!':
  1143. raise ScannerError("while scanning a %s" % name, start_mark,
  1144. "expected '!', but found %r" % ch, self.get_mark())
  1145. length = 1
  1146. ch = self.peek(length)
  1147. if ch != ' ':
  1148. while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
  1149. or ch in '-_':
  1150. length += 1
  1151. ch = self.peek(length)
  1152. if ch != '!':
  1153. self.forward(length)
  1154. raise ScannerError("while scanning a %s" % name, start_mark,
  1155. "expected '!', but found %r" % ch, self.get_mark())
  1156. length += 1
  1157. value = self.prefix(length)
  1158. self.forward(length)
  1159. return value
  1160. def scan_tag_uri(self, name, start_mark):
  1161. # See the specification for details.
  1162. # Note: we do not check if URI is well-formed.
  1163. chunks = []
  1164. length = 0
  1165. ch = self.peek(length)
  1166. while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \
  1167. or ch in '-;/?:@&=+$,_.!~*\'()[]%':
  1168. if ch == '%':
  1169. chunks.append(self.prefix(length))
  1170. self.forward(length)
  1171. length = 0
  1172. chunks.append(self.scan_uri_escapes(name, start_mark))
  1173. else:
  1174. length += 1
  1175. ch = self.peek(length)
  1176. if length:
  1177. chunks.append(self.prefix(length))
  1178. self.forward(length)
  1179. length = 0
  1180. if not chunks:
  1181. raise ScannerError("while parsing a %s" % name, start_mark,
  1182. "expected URI, but found %r" % ch, self.get_mark())
  1183. return ''.join(chunks)
  1184. def scan_uri_escapes(self, name, start_mark):
  1185. # See the specification for details.
  1186. codes = []
  1187. mark = self.get_mark()
  1188. while self.peek() == '%':
  1189. self.forward()
  1190. for k in range(2):
  1191. if self.peek(k) not in '0123456789ABCDEFabcdef':
  1192. raise ScannerError("while scanning a %s" % name, start_mark,
  1193. "expected URI escape sequence of 2 hexdecimal numbers, but found %r"
  1194. % self.peek(k), self.get_mark())
  1195. codes.append(int(self.prefix(2), 16))
  1196. self.forward(2)
  1197. try:
  1198. value = bytes(codes).decode('utf-8')
  1199. except UnicodeDecodeError as exc:
  1200. raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark)
  1201. return value
  1202. def scan_line_break(self):
  1203. # Transforms:
  1204. # '\r\n' : '\n'
  1205. # '\r' : '\n'
  1206. # '\n' : '\n'
  1207. # '\x85' : '\n'
  1208. # '\u2028' : '\u2028'
  1209. # '\u2029 : '\u2029'
  1210. # default : ''
  1211. ch = self.peek()
  1212. if ch in '\r\n\x85':
  1213. if self.prefix(2) == '\r\n':
  1214. self.forward(2)
  1215. else:
  1216. self.forward()
  1217. return '\n'
  1218. elif ch in '\u2028\u2029':
  1219. self.forward()
  1220. return ch
  1221. return ''
  1222. #try:
  1223. # import psyco
  1224. # psyco.bind(Scanner)
  1225. #except ImportError:
  1226. # pass