  1. """
  2. yaml.py
  3. Lexer for YAML, a human-friendly data serialization language
  4. (http://yaml.org/).
  5. Written by Kirill Simonov <xi@resolvent.net>.
  6. License: Whatever suitable for inclusion into the Pygments package.
  7. """
  8. from pygments.lexer import \
  9. ExtendedRegexLexer, LexerContext, include, bygroups
  10. from pygments.token import \
  11. Text, Comment, Punctuation, Name, Literal
  12. __all__ = ['YAMLLexer']


class YAMLLexerContext(LexerContext):
    """Indentation context for the YAML lexer."""

    def __init__(self, *args, **kwds):
        super(YAMLLexerContext, self).__init__(*args, **kwds)
        self.indent_stack = []
        self.indent = -1
        self.next_indent = 0
        self.block_scalar_indent = None


def something(TokenClass):
    """Do not produce empty tokens."""
    def callback(lexer, match, context):
        text = match.group()
        if not text:
            return
        yield match.start(), TokenClass, text
        context.pos = match.end()
    return callback


def reset_indent(TokenClass):
    """Reset the indentation levels."""
    def callback(lexer, match, context):
        text = match.group()
        context.indent_stack = []
        context.indent = -1
        context.next_indent = 0
        context.block_scalar_indent = None
        yield match.start(), TokenClass, text
        context.pos = match.end()
    return callback


def save_indent(TokenClass, start=False):
    """Save a possible indentation level."""
    def callback(lexer, match, context):
        text = match.group()
        extra = ''
        if start:
            context.next_indent = len(text)
            if context.next_indent < context.indent:
                while context.next_indent < context.indent:
                    context.indent = context.indent_stack.pop()
                if context.next_indent > context.indent:
                    extra = text[context.indent:]
                    text = text[:context.indent]
        else:
            context.next_indent += len(text)
        if text:
            yield match.start(), TokenClass, text
        if extra:
            yield match.start()+len(text), TokenClass.Error, extra
        context.pos = match.end()
    return callback


def set_indent(TokenClass, implicit=False):
    """Set the previously saved indentation level."""
    def callback(lexer, match, context):
        text = match.group()
        if context.indent < context.next_indent:
            context.indent_stack.append(context.indent)
            context.indent = context.next_indent
        if not implicit:
            context.next_indent += len(text)
        yield match.start(), TokenClass, text
        context.pos = match.end()
    return callback


def set_block_scalar_indent(TokenClass):
    """Set an explicit indentation level for a block scalar."""
    def callback(lexer, match, context):
        text = match.group()
        context.block_scalar_indent = None
        if not text:
            return
        increment = match.group(1)
        if increment:
            current_indent = max(context.indent, 0)
            increment = int(increment)
            context.block_scalar_indent = current_indent + increment
        if text:
            yield match.start(), TokenClass, text
            context.pos = match.end()
    return callback


def parse_block_scalar_empty_line(IndentTokenClass, ContentTokenClass):
    """Process an empty line in a block scalar."""
    def callback(lexer, match, context):
        text = match.group()
        if (context.block_scalar_indent is None or
                len(text) <= context.block_scalar_indent):
            if text:
                yield match.start(), IndentTokenClass, text
        else:
            indentation = text[:context.block_scalar_indent]
            content = text[context.block_scalar_indent:]
            yield match.start(), IndentTokenClass, indentation
            yield (match.start()+context.block_scalar_indent,
                   ContentTokenClass, content)
        context.pos = match.end()
    return callback


def parse_block_scalar_indent(TokenClass):
    """Process indentation spaces in a block scalar."""
    def callback(lexer, match, context):
        text = match.group()
        if context.block_scalar_indent is None:
            if len(text) <= max(context.indent, 0):
                context.stack.pop()
                context.stack.pop()
                return
            context.block_scalar_indent = len(text)
        else:
            if len(text) < context.block_scalar_indent:
                context.stack.pop()
                context.stack.pop()
                return
        if text:
            yield match.start(), TokenClass, text
            context.pos = match.end()
    return callback


def parse_plain_scalar_indent(TokenClass):
    """Process indentation spaces in a plain scalar."""
    def callback(lexer, match, context):
        text = match.group()
        if len(text) <= context.indent:
            context.stack.pop()
            context.stack.pop()
            return
        if text:
            yield match.start(), TokenClass, text
            context.pos = match.end()
    return callback


class YAMLLexer(ExtendedRegexLexer):
    """Lexer for the YAML language."""

    name = 'YAML'
    aliases = ['yaml']
    filenames = ['*.yaml', '*.yml']
    mimetypes = ['text/x-yaml']

    tokens = {
        # the root rules
        'root': [
            # ignored whitespaces
            (r'[ ]+(?=#|$)', Text.Blank),
            # line breaks
            (r'\n+', Text.Break),
            # a comment
            (r'#[^\n]*', Comment.Single),
            # the '%YAML' directive
            (r'^%YAML(?=[ ]|$)', reset_indent(Name.Directive),
             'yaml-directive'),
            # the '%TAG' directive
            (r'^%TAG(?=[ ]|$)', reset_indent(Name.Directive),
             'tag-directive'),
            # document start and document end indicators
            (r'^(?:---|\.\.\.)(?=[ ]|$)',
             reset_indent(Punctuation.Document), 'block-line'),
            # indentation spaces
            (r'[ ]*(?![ \t\n\r\f\v]|$)',
             save_indent(Text.Indent, start=True),
             ('block-line', 'indentation')),
        ],
        # trailing whitespaces after directives or a block scalar indicator
        'ignored-line': [
            # ignored whitespaces
            (r'[ ]+(?=#|$)', Text.Blank),
            # a comment
            (r'#[^\n]*', Comment.Single),
            # line break
            (r'\n', Text.Break, '#pop:2'),
        ],
        # the %YAML directive
        'yaml-directive': [
            # the version number
            (r'([ ]+)([0-9]+\.[0-9]+)',
             bygroups(Text.Blank, Literal.Version), 'ignored-line'),
        ],
        # the %TAG directive
        'tag-directive': [
            # a tag handle and the corresponding prefix
            (r'([ ]+)(!|![0-9A-Za-z_-]*!)'
             r'([ ]+)(!|!?[0-9A-Za-z;/?:@&=+$,_.!~*\'()\[\]%-]+)',
             bygroups(Text.Blank, Name.Type, Text.Blank, Name.Type),
             'ignored-line'),
        ],
        # block scalar indicators and indentation spaces
        'indentation': [
            # trailing whitespaces are ignored
            (r'[ ]*$', something(Text.Blank), '#pop:2'),
            # whitespaces preceding block collection indicators
            (r'[ ]+(?=[?:-](?:[ ]|$))', save_indent(Text.Indent)),
            # block collection indicators
            (r'[?:-](?=[ ]|$)', set_indent(Punctuation.Indicator)),
            # the beginning of a block line
            (r'[ ]*', save_indent(Text.Indent), '#pop'),
        ],
        # an indented line in the block context
        'block-line': [
            # the line end
            (r'[ ]*(?=#|$)', something(Text.Blank), '#pop'),
            # whitespaces separating tokens
            (r'[ ]+', Text.Blank),
            # tags, anchors and aliases
            include('descriptors'),
            # block collections and scalars
            include('block-nodes'),
            # flow collections and quoted scalars
            include('flow-nodes'),
            # a plain scalar
            (r'(?=[^ \t\n\r\f\v?:,\[\]{}#&*!|>\'"%@`-]|[?:-][^ \t\n\r\f\v])',
             something(Literal.Scalar.Plain),
             'plain-scalar-in-block-context'),
        ],
        # tags, anchors, aliases
        'descriptors': [
            # a full-form tag
            (r'!<[0-9A-Za-z;/?:@&=+$,_.!~*\'()\[\]%-]+>', Name.Type),
            # a tag in the form '!', '!suffix' or '!handle!suffix'
            (r'!(?:[0-9A-Za-z_-]+)?'
             r'(?:![0-9A-Za-z;/?:@&=+$,_.!~*\'()\[\]%-]+)?', Name.Type),
            # an anchor
            (r'&[0-9A-Za-z_-]+', Name.Anchor),
            # an alias
            (r'\*[0-9A-Za-z_-]+', Name.Alias),
        ],
        # block collections and scalars
        'block-nodes': [
            # implicit key
            (r':(?=[ ]|$)', set_indent(Punctuation.Indicator, implicit=True)),
            # literal and folded scalars
            (r'[|>]', Punctuation.Indicator,
             ('block-scalar-content', 'block-scalar-header')),
        ],
        # flow collections and quoted scalars
        'flow-nodes': [
            # a flow sequence
            (r'\[', Punctuation.Indicator, 'flow-sequence'),
            # a flow mapping
            (r'\{', Punctuation.Indicator, 'flow-mapping'),
            # a single-quoted scalar
            (r'\'', Literal.Scalar.Flow.Quote, 'single-quoted-scalar'),
            # a double-quoted scalar
            (r'\"', Literal.Scalar.Flow.Quote, 'double-quoted-scalar'),
        ],
        # the content of a flow collection
        'flow-collection': [
            # whitespaces
            (r'[ ]+', Text.Blank),
            # line breaks
            (r'\n+', Text.Break),
            # a comment
            (r'#[^\n]*', Comment.Single),
            # simple indicators
            (r'[?:,]', Punctuation.Indicator),
            # tags, anchors and aliases
            include('descriptors'),
            # nested collections and quoted scalars
            include('flow-nodes'),
            # a plain scalar
            (r'(?=[^ \t\n\r\f\v?:,\[\]{}#&*!|>\'"%@`])',
             something(Literal.Scalar.Plain),
             'plain-scalar-in-flow-context'),
        ],
        # a flow sequence indicated by '[' and ']'
        'flow-sequence': [
            # include flow collection rules
            include('flow-collection'),
            # the closing indicator
            (r'\]', Punctuation.Indicator, '#pop'),
        ],
        # a flow mapping indicated by '{' and '}'
        'flow-mapping': [
            # include flow collection rules
            include('flow-collection'),
            # the closing indicator
            (r'\}', Punctuation.Indicator, '#pop'),
        ],
        # block scalar lines
        'block-scalar-content': [
            # line break
            (r'\n', Text.Break),
            # empty line
            (r'^[ ]+$',
             parse_block_scalar_empty_line(Text.Indent,
                                           Literal.Scalar.Block)),
            # indentation spaces (we may leave the state here)
            (r'^[ ]*', parse_block_scalar_indent(Text.Indent)),
            # line content
            (r'[^\n\r\f\v]+', Literal.Scalar.Block),
        ],
        # the header of a literal or folded scalar
        'block-scalar-header': [
            # indentation indicator followed by chomping flag
            (r'([1-9])?[+-]?(?=[ ]|$)',
             set_block_scalar_indent(Punctuation.Indicator),
             'ignored-line'),
            # chomping flag followed by indentation indicator
            (r'[+-]?([1-9])?(?=[ ]|$)',
             set_block_scalar_indent(Punctuation.Indicator),
             'ignored-line'),
        ],
        # ignored and regular whitespaces in quoted scalars
        'quoted-scalar-whitespaces': [
            # leading and trailing whitespaces are ignored
            (r'^[ ]+|[ ]+$', Text.Blank),
            # line breaks are ignored
            (r'\n+', Text.Break),
            # other whitespaces are a part of the value
            (r'[ ]+', Literal.Scalar.Flow),
        ],
        # single-quoted scalars
        'single-quoted-scalar': [
            # include whitespace and line break rules
            include('quoted-scalar-whitespaces'),
            # escaping of the quote character
            (r'\'\'', Literal.Scalar.Flow.Escape),
            # regular non-whitespace characters
            (r'[^ \t\n\r\f\v\']+', Literal.Scalar.Flow),
            # the closing quote
            (r'\'', Literal.Scalar.Flow.Quote, '#pop'),
        ],
        # double-quoted scalars
        'double-quoted-scalar': [
            # include whitespace and line break rules
            include('quoted-scalar-whitespaces'),
            # escaping of special characters
            (r'\\[0abt\tn\nvfre "\\N_LP]', Literal.Scalar.Flow.Escape),
            # escape codes
            (r'\\(?:x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})',
             Literal.Scalar.Flow.Escape),
            # regular non-whitespace characters
            (r'[^ \t\n\r\f\v\"\\]+', Literal.Scalar.Flow),
            # the closing quote
            (r'"', Literal.Scalar.Flow.Quote, '#pop'),
        ],
        # the beginning of a new line while scanning a plain scalar
        'plain-scalar-in-block-context-new-line': [
            # empty lines
            (r'^[ ]+$', Text.Blank),
            # line breaks
            (r'\n+', Text.Break),
            # document start and document end indicators
            (r'^(?=---|\.\.\.)', something(Punctuation.Document), '#pop:3'),
            # indentation spaces (we may leave the block line state here)
            (r'^[ ]*', parse_plain_scalar_indent(Text.Indent), '#pop'),
        ],
        # a plain scalar in the block context
        'plain-scalar-in-block-context': [
            # the scalar ends with the ':' indicator
            (r'[ ]*(?=:[ ]|:$)', something(Text.Blank), '#pop'),
            # the scalar ends with whitespaces followed by a comment
            (r'[ ]+(?=#)', Text.Blank, '#pop'),
            # trailing whitespaces are ignored
            (r'[ ]+$', Text.Blank),
            # line breaks are ignored
            (r'\n+', Text.Break, 'plain-scalar-in-block-context-new-line'),
            # other whitespaces are a part of the value
            (r'[ ]+', Literal.Scalar.Plain),
            # regular non-whitespace characters
            (r'(?::(?![ \t\n\r\f\v])|[^ \t\n\r\f\v:])+',
             Literal.Scalar.Plain),
        ],
        # a plain scalar in the flow context
        'plain-scalar-in-flow-context': [
            # the scalar ends with an indicator character
            (r'[ ]*(?=[,:?\[\]{}])', something(Text.Blank), '#pop'),
            # the scalar ends with a comment
            (r'[ ]+(?=#)', Text.Blank, '#pop'),
            # leading and trailing whitespaces are ignored
            (r'^[ ]+|[ ]+$', Text.Blank),
            # line breaks are ignored
            (r'\n+', Text.Break),
            # other whitespaces are a part of the value
            (r'[ ]+', Literal.Scalar.Plain),
            # regular non-whitespace characters
            (r'[^ \t\n\r\f\v,:?\[\]{}]+', Literal.Scalar.Plain),
        ],
    }

    def get_tokens_unprocessed(self, text=None, context=None):
        if context is None:
            context = YAMLLexerContext(text, 0)
        return super(YAMLLexer, self).get_tokens_unprocessed(text, context)
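

# ---------------------------------------------------------------------------
# Example usage (not part of the original module): a minimal sketch showing
# how this lexer could be exercised directly.  The sample YAML text and the
# use of the inherited get_tokens() helper are illustrative assumptions, not
# something the original file prescribes.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    sample = (
        "invoice: 34843\n"
        "date: 2001-01-23\n"
        "items:\n"
        "  - sku: BL394D\n"
        "    quantity: 4\n"
    )
    # get_tokens() comes from pygments.lexer.Lexer and yields
    # (token_type, value) pairs for the given text.
    for token_type, value in YAMLLexer().get_tokens(sample):
        print('%-40s %r' % (token_type, value))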