Rosetta
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
parser.py
Go to the documentation of this file.
# (c) Copyright Rosetta Commons Member Institutions.
# (c) This file is part of the Rosetta software suite and is made available under license.
# (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
# (c) For more information, see http://www.rosettacommons.org. Questions about this can be
# (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.

# The following YAML grammar is LL(1) and is parsed by a recursive descent
# parser.
#
# stream            ::= STREAM-START implicit_document? explicit_document* STREAM-END
# implicit_document ::= block_node DOCUMENT-END*
# explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END*
# block_node_or_indentless_sequence ::=
#                       ALIAS
#                       | properties (block_content | indentless_block_sequence)?
#                       | block_content
#                       | indentless_block_sequence
# block_node        ::= ALIAS
#                       | properties block_content?
#                       | block_content
# flow_node         ::= ALIAS
#                       | properties flow_content?
#                       | flow_content
# properties        ::= TAG ANCHOR? | ANCHOR TAG?
# block_content     ::= block_collection | flow_collection | SCALAR
# flow_content      ::= flow_collection | SCALAR
# block_collection  ::= block_sequence | block_mapping
# flow_collection   ::= flow_sequence | flow_mapping
# block_sequence    ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
# indentless_sequence ::= (BLOCK-ENTRY block_node?)+
# block_mapping     ::= BLOCK-MAPPING_START
#                       ((KEY block_node_or_indentless_sequence?)?
#                       (VALUE block_node_or_indentless_sequence?)?)*
#                       BLOCK-END
# flow_sequence     ::= FLOW-SEQUENCE-START
#                       (flow_sequence_entry FLOW-ENTRY)*
#                       flow_sequence_entry?
#                       FLOW-SEQUENCE-END
# flow_sequence_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
# flow_mapping      ::= FLOW-MAPPING-START
#                       (flow_mapping_entry FLOW-ENTRY)*
#                       flow_mapping_entry?
#                       FLOW-MAPPING-END
# flow_mapping_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
#
# FIRST sets:
#
# stream: { STREAM-START }
# explicit_document: { DIRECTIVE DOCUMENT-START }
# implicit_document: FIRST(block_node)
# block_node: { ALIAS TAG ANCHOR SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START }
# flow_node: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START }
# block_content: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
# flow_content: { FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
# block_collection: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START }
# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
# block_sequence: { BLOCK-SEQUENCE-START }
# block_mapping: { BLOCK-MAPPING-START }
# block_node_or_indentless_sequence: { ALIAS ANCHOR TAG SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START BLOCK-ENTRY }
# indentless_sequence: { ENTRY }
# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
# flow_sequence: { FLOW-SEQUENCE-START }
# flow_mapping: { FLOW-MAPPING-START }
# flow_sequence_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
# flow_mapping_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
66 
# Public API of this module: the Parser state machine and its error type.
__all__ = ['Parser', 'ParserError']

# Python 2 implicit-relative imports of the sibling PyYAML modules.
from error import MarkedYAMLError
from tokens import *
from events import *
from scanner import *
class ParserError(MarkedYAMLError):
    """Raised when the token stream cannot be parsed as a valid YAML
    event stream (e.g. missing document start, unterminated block or
    flow collection, undefined tag handle)."""
    pass
class Parser(object):
    """LL(1) recursive-descent parser: turns the scanner's token stream
    into a stream of events (check_event/peek_event/get_event).

    The parser is written as an explicit state machine: ``self.state``
    holds the bound method that produces the next event, ``self.states``
    is the stack of return states, and ``self.marks`` the stack of
    start marks for open collections (used in error reports).
    """
    # Since writing a recursive-descendant parser is a straightforward task, we
    # do not give many comments here.
    # Note that we use Python generators. If you rewrite the parser in another
    # language, you may replace all 'yield'-s with event handler calls.
    #
    # NOTE(review): the doxygen listing this file was recovered from dropped
    # every hyperlinked line (method headers and `self.state = ...`
    # transitions); those lines were restored from the definition index at the
    # bottom of the listing -- verify against upstream PyYAML if in doubt.

    # Tag handles that are always defined, per the YAML 1.1 spec.
    DEFAULT_TAGS = {
        u'!':   u'!',
        u'!!':  u'tag:yaml.org,2002:',
        }

    def __init__(self):
        self.current_event = None
        self.yaml_version = None
        self.tag_handles = {}
        self.states = []
        self.marks = []
        self.state = self.parse_stream_start

    def check_event(self, *choices):
        """Check the type of the next event without consuming it.

        With no arguments, return True if any event is available;
        otherwise return True if the next event is an instance of one
        of the given event classes."""
        if self.current_event is None:
            if self.state:
                self.current_event = self.state()
        if self.current_event is not None:
            if not choices:
                return True
            for choice in choices:
                if isinstance(self.current_event, choice):
                    return True
        return False

    def peek_event(self):
        """Return the next event without consuming it (None at end)."""
        if self.current_event is None:
            if self.state:
                self.current_event = self.state()
        return self.current_event

    def get_event(self):
        """Return the next event and advance past it."""
        if self.current_event is None:
            if self.state:
                self.current_event = self.state()
        value = self.current_event
        self.current_event = None
        return value

    # stream    ::= STREAM-START implicit_document? explicit_document* STREAM-END
    # implicit_document ::= block_node DOCUMENT-END*
    # explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END*

    def parse_stream_start(self):

        # Parse the stream start.
        token = self.get_token()
        event = StreamStartEvent(token.start_mark, token.end_mark,
                encoding=token.encoding)

        # Prepare the next state.
        self.state = self.parse_implicit_document_start

        return event

    def parse_implicit_document_start(self):

        # Parse an implicit document (a bare block node with no '---').
        if not self.check_token(DirectiveToken, DocumentStartToken,
                StreamEndToken):
            self.tag_handles = self.DEFAULT_TAGS
            token = self.peek_token()
            start_mark = end_mark = token.start_mark
            event = DocumentStartEvent(start_mark, end_mark,
                    explicit=False)

            # Prepare the next state.
            self.states.append(self.parse_document_end)
            self.state = self.parse_block_node

            return event

        else:
            return self.parse_document_start()

    def parse_document_start(self):

        # Parse any extra document end indicators.
        while self.check_token(DocumentEndToken):
            self.get_token()

        # Parse an explicit document.
        if not self.check_token(StreamEndToken):
            token = self.peek_token()
            start_mark = token.start_mark
            version, tags = self.process_directives()
            if not self.check_token(DocumentStartToken):
                raise ParserError(None, None,
                        "expected '<document start>', but found %r"
                        % self.peek_token().id,
                        self.peek_token().start_mark)
            token = self.get_token()
            end_mark = token.end_mark
            event = DocumentStartEvent(start_mark, end_mark,
                    explicit=True, version=version, tags=tags)
            self.states.append(self.parse_document_end)
            self.state = self.parse_document_content
        else:
            # Parse the end of the stream.
            token = self.get_token()
            event = StreamEndEvent(token.start_mark, token.end_mark)
            # A well-formed parse leaves both stacks empty at stream end.
            assert not self.states
            assert not self.marks
            self.state = None
        return event

    def parse_document_end(self):

        # Parse the document end ('...' is optional, hence explicit flag).
        token = self.peek_token()
        start_mark = end_mark = token.start_mark
        explicit = False
        if self.check_token(DocumentEndToken):
            token = self.get_token()
            end_mark = token.end_mark
            explicit = True
        event = DocumentEndEvent(start_mark, end_mark,
                explicit=explicit)

        # Prepare the next state.
        self.state = self.parse_document_start

        return event

    def parse_document_content(self):
        # An immediately-closed document has an empty scalar as content.
        if self.check_token(DirectiveToken,
                DocumentStartToken, DocumentEndToken, StreamEndToken):
            event = self.process_empty_scalar(self.peek_token().start_mark)
            self.state = self.states.pop()
            return event
        else:
            return self.parse_block_node()

    def process_directives(self):
        """Consume leading %YAML/%TAG directives; return (version, tags).

        Also installs the tag handles (user-defined plus defaults) used
        to resolve tag shorthands while parsing this document."""
        self.yaml_version = None
        self.tag_handles = {}
        while self.check_token(DirectiveToken):
            token = self.get_token()
            if token.name == u'YAML':
                if self.yaml_version is not None:
                    raise ParserError(None, None,
                            "found duplicate YAML directive", token.start_mark)
                major, minor = token.value
                if major != 1:
                    raise ParserError(None, None,
                            "found incompatible YAML document (version 1.* is required)",
                            token.start_mark)
                self.yaml_version = token.value
            elif token.name == u'TAG':
                handle, prefix = token.value
                if handle in self.tag_handles:
                    raise ParserError(None, None,
                            "duplicate tag handle %r" % handle.encode('utf-8'),
                            token.start_mark)
                self.tag_handles[handle] = prefix
        # Snapshot the user-defined handles for the DocumentStartEvent
        # before merging in the defaults.
        if self.tag_handles:
            value = self.yaml_version, self.tag_handles.copy()
        else:
            value = self.yaml_version, None
        for key in self.DEFAULT_TAGS:
            if key not in self.tag_handles:
                self.tag_handles[key] = self.DEFAULT_TAGS[key]
        return value

    # block_node_or_indentless_sequence ::= ALIAS
    #               | properties (block_content | indentless_block_sequence)?
    #               | block_content
    #               | indentless_block_sequence
    # block_node    ::= ALIAS
    #                   | properties block_content?
    #                   | block_content
    # flow_node     ::= ALIAS
    #                   | properties flow_content?
    #                   | flow_content
    # properties    ::= TAG ANCHOR? | ANCHOR TAG?
    # block_content     ::= block_collection | flow_collection | SCALAR
    # flow_content      ::= flow_collection | SCALAR
    # block_collection  ::= block_sequence | block_mapping
    # flow_collection   ::= flow_sequence | flow_mapping

    def parse_block_node(self):
        return self.parse_node(block=True)

    def parse_flow_node(self):
        return self.parse_node()

    def parse_block_node_or_indentless_sequence(self):
        return self.parse_node(block=True, indentless_sequence=True)

    def parse_node(self, block=False, indentless_sequence=False):
        """Parse one node: an alias, or optional anchor/tag properties
        followed by scalar or collection content."""
        if self.check_token(AliasToken):
            token = self.get_token()
            event = AliasEvent(token.value, token.start_mark, token.end_mark)
            self.state = self.states.pop()
        else:
            anchor = None
            tag = None
            start_mark = end_mark = tag_mark = None
            # Properties may come in either order: ANCHOR TAG? | TAG ANCHOR?
            if self.check_token(AnchorToken):
                token = self.get_token()
                start_mark = token.start_mark
                end_mark = token.end_mark
                anchor = token.value
                if self.check_token(TagToken):
                    token = self.get_token()
                    tag_mark = token.start_mark
                    end_mark = token.end_mark
                    tag = token.value
            elif self.check_token(TagToken):
                token = self.get_token()
                start_mark = tag_mark = token.start_mark
                end_mark = token.end_mark
                tag = token.value
                if self.check_token(AnchorToken):
                    token = self.get_token()
                    end_mark = token.end_mark
                    anchor = token.value
            # Resolve a (handle, suffix) shorthand against the declared
            # tag handles; a bare suffix is already a full tag.
            if tag is not None:
                handle, suffix = tag
                if handle is not None:
                    if handle not in self.tag_handles:
                        raise ParserError("while parsing a node", start_mark,
                                "found undefined tag handle %r" % handle.encode('utf-8'),
                                tag_mark)
                    tag = self.tag_handles[handle]+suffix
                else:
                    tag = suffix
            #if tag == u'!':
            #    raise ParserError("while parsing a node", start_mark,
            #            "found non-specific tag '!'", tag_mark,
            #            "Please check 'http://pyyaml.org/wiki/YAMLNonSpecificTag' and share your opinion.")
            if start_mark is None:
                start_mark = end_mark = self.peek_token().start_mark
            event = None
            implicit = (tag is None or tag == u'!')
            if indentless_sequence and self.check_token(BlockEntryToken):
                end_mark = self.peek_token().end_mark
                event = SequenceStartEvent(anchor, tag, implicit,
                        start_mark, end_mark)
                self.state = self.parse_indentless_sequence_entry
            else:
                if self.check_token(ScalarToken):
                    token = self.get_token()
                    end_mark = token.end_mark
                    # implicit is a pair: (plain-implicit, quoted-implicit).
                    if (token.plain and tag is None) or tag == u'!':
                        implicit = (True, False)
                    elif tag is None:
                        implicit = (False, True)
                    else:
                        implicit = (False, False)
                    event = ScalarEvent(anchor, tag, implicit, token.value,
                            start_mark, end_mark, style=token.style)
                    self.state = self.states.pop()
                elif self.check_token(FlowSequenceStartToken):
                    end_mark = self.peek_token().end_mark
                    event = SequenceStartEvent(anchor, tag, implicit,
                            start_mark, end_mark, flow_style=True)
                    self.state = self.parse_flow_sequence_first_entry
                elif self.check_token(FlowMappingStartToken):
                    end_mark = self.peek_token().end_mark
                    event = MappingStartEvent(anchor, tag, implicit,
                            start_mark, end_mark, flow_style=True)
                    self.state = self.parse_flow_mapping_first_key
                elif block and self.check_token(BlockSequenceStartToken):
                    end_mark = self.peek_token().start_mark
                    event = SequenceStartEvent(anchor, tag, implicit,
                            start_mark, end_mark, flow_style=False)
                    self.state = self.parse_block_sequence_first_entry
                elif block and self.check_token(BlockMappingStartToken):
                    end_mark = self.peek_token().start_mark
                    event = MappingStartEvent(anchor, tag, implicit,
                            start_mark, end_mark, flow_style=False)
                    self.state = self.parse_block_mapping_first_key
                elif anchor is not None or tag is not None:
                    # Empty scalars are allowed even if a tag or an anchor is
                    # specified.
                    event = ScalarEvent(anchor, tag, (implicit, False), u'',
                            start_mark, end_mark)
                    self.state = self.states.pop()
                else:
                    if block:
                        node = 'block'
                    else:
                        node = 'flow'
                    token = self.peek_token()
                    raise ParserError("while parsing a %s node" % node, start_mark,
                            "expected the node content, but found %r" % token.id,
                            token.start_mark)
        return event

    # block_sequence ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END

    def parse_block_sequence_first_entry(self):
        token = self.get_token()
        self.marks.append(token.start_mark)
        return self.parse_block_sequence_entry()

    def parse_block_sequence_entry(self):
        if self.check_token(BlockEntryToken):
            token = self.get_token()
            if not self.check_token(BlockEntryToken, BlockEndToken):
                self.states.append(self.parse_block_sequence_entry)
                return self.parse_block_node()
            else:
                # '- ' followed by nothing: an empty scalar entry.
                self.state = self.parse_block_sequence_entry
                return self.process_empty_scalar(token.end_mark)
        if not self.check_token(BlockEndToken):
            token = self.peek_token()
            raise ParserError("while parsing a block collection", self.marks[-1],
                    "expected <block end>, but found %r" % token.id, token.start_mark)
        token = self.get_token()
        event = SequenceEndEvent(token.start_mark, token.end_mark)
        self.state = self.states.pop()
        self.marks.pop()
        return event

    # indentless_sequence ::= (BLOCK-ENTRY block_node?)+

    def parse_indentless_sequence_entry(self):
        if self.check_token(BlockEntryToken):
            token = self.get_token()
            if not self.check_token(BlockEntryToken,
                    KeyToken, ValueToken, BlockEndToken):
                self.states.append(self.parse_indentless_sequence_entry)
                return self.parse_block_node()
            else:
                self.state = self.parse_indentless_sequence_entry
                return self.process_empty_scalar(token.end_mark)
        # No BLOCK-END token exists for indentless sequences; emit a
        # zero-width end event at the next token's position.
        token = self.peek_token()
        event = SequenceEndEvent(token.start_mark, token.start_mark)
        self.state = self.states.pop()
        return event

    # block_mapping     ::= BLOCK-MAPPING_START
    #                       ((KEY block_node_or_indentless_sequence?)?
    #                       (VALUE block_node_or_indentless_sequence?)?)*
    #                       BLOCK-END

    def parse_block_mapping_first_key(self):
        token = self.get_token()
        self.marks.append(token.start_mark)
        return self.parse_block_mapping_key()

    def parse_block_mapping_key(self):
        if self.check_token(KeyToken):
            token = self.get_token()
            if not self.check_token(KeyToken, ValueToken, BlockEndToken):
                self.states.append(self.parse_block_mapping_value)
                return self.parse_block_node_or_indentless_sequence()
            else:
                # '? ' with no key node: empty-scalar key.
                self.state = self.parse_block_mapping_value
                return self.process_empty_scalar(token.end_mark)
        if not self.check_token(BlockEndToken):
            token = self.peek_token()
            raise ParserError("while parsing a block mapping", self.marks[-1],
                    "expected <block end>, but found %r" % token.id, token.start_mark)
        token = self.get_token()
        event = MappingEndEvent(token.start_mark, token.end_mark)
        self.state = self.states.pop()
        self.marks.pop()
        return event

    def parse_block_mapping_value(self):
        if self.check_token(ValueToken):
            token = self.get_token()
            if not self.check_token(KeyToken, ValueToken, BlockEndToken):
                self.states.append(self.parse_block_mapping_key)
                return self.parse_block_node_or_indentless_sequence()
            else:
                self.state = self.parse_block_mapping_key
                return self.process_empty_scalar(token.end_mark)
        else:
            # Key without a ':' value: the value is an empty scalar.
            self.state = self.parse_block_mapping_key
            token = self.peek_token()
            return self.process_empty_scalar(token.start_mark)

    # flow_sequence     ::= FLOW-SEQUENCE-START
    #                       (flow_sequence_entry FLOW-ENTRY)*
    #                       flow_sequence_entry?
    #                       FLOW-SEQUENCE-END
    # flow_sequence_entry   ::= flow_node | KEY flow_node? (VALUE flow_node?)?
    #
    # Note that while production rules for both flow_sequence_entry and
    # flow_mapping_entry are equal, their interpretations are different.
    # For `flow_sequence_entry`, the part `KEY flow_node? (VALUE flow_node?)?`
    # generate an inline mapping (set syntax).

    def parse_flow_sequence_first_entry(self):
        token = self.get_token()
        self.marks.append(token.start_mark)
        return self.parse_flow_sequence_entry(first=True)

    def parse_flow_sequence_entry(self, first=False):
        if not self.check_token(FlowSequenceEndToken):
            if not first:
                # Entries after the first must be ','-separated.
                if self.check_token(FlowEntryToken):
                    self.get_token()
                else:
                    token = self.peek_token()
                    raise ParserError("while parsing a flow sequence", self.marks[-1],
                            "expected ',' or ']', but got %r" % token.id, token.start_mark)

            if self.check_token(KeyToken):
                # '?' inside a flow sequence starts an inline single-pair
                # mapping (set syntax).
                token = self.peek_token()
                event = MappingStartEvent(None, None, True,
                        token.start_mark, token.end_mark,
                        flow_style=True)
                self.state = self.parse_flow_sequence_entry_mapping_key
                return event
            elif not self.check_token(FlowSequenceEndToken):
                self.states.append(self.parse_flow_sequence_entry)
                return self.parse_flow_node()
        token = self.get_token()
        event = SequenceEndEvent(token.start_mark, token.end_mark)
        self.state = self.states.pop()
        self.marks.pop()
        return event

    def parse_flow_sequence_entry_mapping_key(self):
        token = self.get_token()
        if not self.check_token(ValueToken,
                FlowEntryToken, FlowSequenceEndToken):
            self.states.append(self.parse_flow_sequence_entry_mapping_value)
            return self.parse_flow_node()
        else:
            self.state = self.parse_flow_sequence_entry_mapping_value
            return self.process_empty_scalar(token.end_mark)

    def parse_flow_sequence_entry_mapping_value(self):
        if self.check_token(ValueToken):
            token = self.get_token()
            if not self.check_token(FlowEntryToken, FlowSequenceEndToken):
                self.states.append(self.parse_flow_sequence_entry_mapping_end)
                return self.parse_flow_node()
            else:
                self.state = self.parse_flow_sequence_entry_mapping_end
                return self.process_empty_scalar(token.end_mark)
        else:
            self.state = self.parse_flow_sequence_entry_mapping_end
            token = self.peek_token()
            return self.process_empty_scalar(token.start_mark)

    def parse_flow_sequence_entry_mapping_end(self):
        # Close the inline single-pair mapping (zero-width event).
        self.state = self.parse_flow_sequence_entry
        token = self.peek_token()
        return MappingEndEvent(token.start_mark, token.start_mark)

    # flow_mapping  ::= FLOW-MAPPING-START
    #                   (flow_mapping_entry FLOW-ENTRY)*
    #                   flow_mapping_entry?
    #                   FLOW-MAPPING-END
    # flow_mapping_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?

    def parse_flow_mapping_first_key(self):
        token = self.get_token()
        self.marks.append(token.start_mark)
        return self.parse_flow_mapping_key(first=True)

    def parse_flow_mapping_key(self, first=False):
        if not self.check_token(FlowMappingEndToken):
            if not first:
                # Entries after the first must be ','-separated.
                if self.check_token(FlowEntryToken):
                    self.get_token()
                else:
                    token = self.peek_token()
                    raise ParserError("while parsing a flow mapping", self.marks[-1],
                            "expected ',' or '}', but got %r" % token.id, token.start_mark)
            if self.check_token(KeyToken):
                token = self.get_token()
                if not self.check_token(ValueToken,
                        FlowEntryToken, FlowMappingEndToken):
                    self.states.append(self.parse_flow_mapping_value)
                    return self.parse_flow_node()
                else:
                    self.state = self.parse_flow_mapping_value
                    return self.process_empty_scalar(token.end_mark)
            elif not self.check_token(FlowMappingEndToken):
                # A bare node in key position: its value is empty.
                self.states.append(self.parse_flow_mapping_empty_value)
                return self.parse_flow_node()
        token = self.get_token()
        event = MappingEndEvent(token.start_mark, token.end_mark)
        self.state = self.states.pop()
        self.marks.pop()
        return event

    def parse_flow_mapping_value(self):
        if self.check_token(ValueToken):
            token = self.get_token()
            if not self.check_token(FlowEntryToken, FlowMappingEndToken):
                self.states.append(self.parse_flow_mapping_key)
                return self.parse_flow_node()
            else:
                self.state = self.parse_flow_mapping_key
                return self.process_empty_scalar(token.end_mark)
        else:
            self.state = self.parse_flow_mapping_key
            token = self.peek_token()
            return self.process_empty_scalar(token.start_mark)

    def parse_flow_mapping_empty_value(self):
        self.state = self.parse_flow_mapping_key
        return self.process_empty_scalar(self.peek_token().start_mark)

    def process_empty_scalar(self, mark):
        """Return a zero-width plain empty-scalar event at *mark*."""
        return ScalarEvent(None, None, (True, False), u'', mark, mark)
591 
def process_directives
Definition: parser.py:219
def process_empty_scalar
Definition: parser.py:589
def parse_stream_start
Definition: parser.py:129
def parse_indentless_sequence_entry
Definition: parser.py:404
def parse_flow_mapping_key
Definition: parser.py:544
def parse_implicit_document_start
Definition: parser.py:141
def parse_document_content
Definition: parser.py:210
def parse_flow_mapping_value
Definition: parser.py:571
def parse_flow_mapping_first_key
Definition: parser.py:539
def parse_flow_mapping_empty_value
Definition: parser.py:585
def parse_flow_sequence_entry_mapping_key
Definition: parser.py:504
def parse_block_node
Definition: parser.py:266
def parse_block_node_or_indentless_sequence
Definition: parser.py:272
def parse_block_sequence_first_entry
Definition: parser.py:378
def parse_block_mapping_value
Definition: parser.py:448
def parse_document_end
Definition: parser.py:192
def parse_flow_sequence_entry_mapping_value
Definition: parser.py:514
def parse_flow_sequence_entry_mapping_end
Definition: parser.py:528
def parse_document_start
Definition: parser.py:161
def parse_block_mapping_first_key
Definition: parser.py:424
def parse_flow_sequence_first_entry
Definition: parser.py:473
def parse_block_sequence_entry
Definition: parser.py:383
def parse_flow_sequence_entry
Definition: parser.py:478
def parse_block_mapping_key
Definition: parser.py:429
dictionary DEFAULT_TAGS
Definition: parser.py:83