32 __all__ = [
'Scanner',
'ScannerError']
34 from error
import MarkedYAMLError
43 def __init__(self, token_number, required, index, line, column, mark):
54 """Initialize the scanner."""
125 for choice
in choices:
126 if isinstance(self.
tokens[0], choice):
143 return self.tokens.pop(0)
260 raise ScannerError(
"while scanning for the next token",
None,
261 "found character %r that cannot start any token"
262 % ch.encode(
'utf-8'), self.get_mark())
274 min_token_number =
None
277 if min_token_number
is None or key.token_number < min_token_number:
278 min_token_number = key.token_number
279 return min_token_number
288 for level
in self.possible_simple_keys.keys():
290 if key.line != self.line \
291 or self.
index-key.index > 1024:
293 raise ScannerError(
"while scanning a simple key", key.mark,
294 "could not found expected ':'", self.get_mark())
315 self.
index, self.line, self.
column, self.get_mark())
324 raise ScannerError(
"while scanning a simple key", key.mark,
325 "could not found expected ':'", self.get_mark())
350 while self.
indent > column:
351 mark = self.get_mark()
352 self.
indent = self.indents.pop()
358 self.indents.append(self.
indent)
370 mark = self.get_mark()
374 encoding=self.encoding))
387 mark = self.get_mark()
424 start_mark = self.get_mark()
426 end_mark = self.get_mark()
427 self.tokens.append(TokenClass(start_mark, end_mark))
447 start_mark = self.get_mark()
449 end_mark = self.get_mark()
450 self.tokens.append(TokenClass(start_mark, end_mark))
470 start_mark = self.get_mark()
472 end_mark = self.get_mark()
473 self.tokens.append(TokenClass(start_mark, end_mark))
484 start_mark = self.get_mark()
486 end_mark = self.get_mark()
497 "sequence entries are not allowed here",
502 mark = self.get_mark()
517 start_mark = self.get_mark()
519 end_mark = self.get_mark()
530 "mapping keys are not allowed here",
535 mark = self.get_mark()
545 start_mark = self.get_mark()
547 end_mark = self.get_mark()
548 self.tokens.append(
KeyToken(start_mark, end_mark))
583 "mapping values are not allowed here",
591 mark = self.get_mark()
601 start_mark = self.get_mark()
603 end_mark = self.get_mark()
604 self.tokens.append(
ValueToken(start_mark, end_mark))
684 self.tokens.append(self.scan_plain())
699 if self.prefix(3) ==
u'---' \
700 and self.peek(3)
in u'\0 \t\r\n\x85\u2028\u2029':
707 if self.prefix(3) ==
u'...' \
708 and self.peek(3)
in u'\0 \t\r\n\x85\u2028\u2029':
714 return self.peek(1)
in u'\0 \t\r\n\x85\u2028\u2029'
724 return self.peek(1)
in u'\0 \t\r\n\x85\u2028\u2029'
734 return self.peek(1)
in u'\0 \t\r\n\x85\u2028\u2029'
751 return ch
not in u'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`' \
752 or (self.peek(1)
not in u'\0 \t\r\n\x85\u2028\u2029'
753 and (ch ==
u'-' or (
not self.
flow_level and ch
in u'?:')))
777 if self.
index == 0
and self.peek() ==
u'\uFEFF':
781 while self.peek() ==
u' ':
783 if self.peek() ==
u'#':
784 while self.peek()
not in u'\0\r\n\x85\u2028\u2029':
786 if self.scan_line_break():
794 start_mark = self.get_mark()
800 end_mark = self.get_mark()
803 end_mark = self.get_mark()
805 end_mark = self.get_mark()
806 while self.peek()
not in u'\0\r\n\x85\u2028\u2029':
814 ch = self.peek(length)
815 while u'0' <= ch <=
u'9' or u'A' <= ch <=
'Z' or u'a' <= ch <=
'z' \
818 ch = self.peek(length)
820 raise ScannerError(
"while scanning a directive", start_mark,
821 "expected alphabetic or numeric character, but found %r"
822 % ch.encode(
'utf-8'), self.get_mark())
823 value = self.prefix(length)
826 if ch
not in u'\0 \r\n\x85\u2028\u2029':
827 raise ScannerError(
"while scanning a directive", start_mark,
828 "expected alphabetic or numeric character, but found %r"
829 % ch.encode(
'utf-8'), self.get_mark())
834 while self.peek() ==
u' ':
837 if self.peek() !=
'.':
838 raise ScannerError(
"while scanning a directive", start_mark,
839 "expected a digit or '.', but found %r"
840 % self.peek().encode(
'utf-8'),
844 if self.peek()
not in u'\0 \r\n\x85\u2028\u2029':
845 raise ScannerError(
"while scanning a directive", start_mark,
846 "expected a digit or ' ', but found %r"
847 % self.peek().encode(
'utf-8'),
849 return (major, minor)
854 if not (
u'0' <= ch <=
'9'):
855 raise ScannerError(
"while scanning a directive", start_mark,
856 "expected a digit, but found %r" % ch.encode(
'utf-8'),
859 while u'0' <= self.peek(length) <=
u'9':
861 value =
int(self.prefix(length))
867 while self.peek() ==
u' ':
870 while self.peek() ==
u' ':
873 return (handle, prefix)
877 value = self.scan_tag_handle(
'directive', start_mark)
880 raise ScannerError(
"while scanning a directive", start_mark,
881 "expected ' ', but found %r" % ch.encode(
'utf-8'),
887 value = self.scan_tag_uri(
'directive', start_mark)
889 if ch
not in u'\0 \r\n\x85\u2028\u2029':
890 raise ScannerError(
"while scanning a directive", start_mark,
891 "expected ' ', but found %r" % ch.encode(
'utf-8'),
897 while self.peek() ==
u' ':
899 if self.peek() ==
u'#':
900 while self.peek()
not in u'\0\r\n\x85\u2028\u2029':
903 if ch
not in u'\0\r\n\x85\u2028\u2029':
904 raise ScannerError(
"while scanning a directive", start_mark,
905 "expected a comment or a line break, but found %r"
906 % ch.encode(
'utf-8'), self.get_mark())
907 self.scan_line_break()
918 start_mark = self.get_mark()
919 indicator = self.peek()
926 ch = self.peek(length)
927 while u'0' <= ch <=
u'9' or u'A' <= ch <=
'Z' or u'a' <= ch <=
'z' \
930 ch = self.peek(length)
932 raise ScannerError(
"while scanning an %s" % name, start_mark,
933 "expected alphabetic or numeric character, but found %r"
934 % ch.encode(
'utf-8'), self.get_mark())
935 value = self.prefix(length)
938 if ch
not in u'\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
939 raise ScannerError(
"while scanning an %s" % name, start_mark,
940 "expected alphabetic or numeric character, but found %r"
941 % ch.encode(
'utf-8'), self.get_mark())
942 end_mark = self.get_mark()
943 return TokenClass(value, start_mark, end_mark)
947 start_mark = self.get_mark()
952 suffix = self.scan_tag_uri(
'tag', start_mark)
953 if self.peek() !=
u'>':
955 "expected '>', but found %r" % self.peek().encode(
'utf-8'),
958 elif ch
in u'\0 \t\r\n\x85\u2028\u2029':
965 while ch
not in u'\0 \r\n\x85\u2028\u2029':
970 ch = self.peek(length)
973 handle = self.scan_tag_handle(
'tag', start_mark)
977 suffix = self.scan_tag_uri(
'tag', start_mark)
979 if ch
not in u'\0 \r\n\x85\u2028\u2029':
981 "expected ' ', but found %r" % ch.encode(
'utf-8'),
983 value = (handle, suffix)
984 end_mark = self.get_mark()
985 return TagToken(value, start_mark, end_mark)
996 start_mark = self.get_mark()
1004 min_indent = self.
indent+1
1007 if increment
is None:
1009 indent =
max(min_indent, max_indent)
1011 indent = min_indent+increment-1
1016 while self.
column == indent
and self.peek() !=
u'\0':
1017 chunks.extend(breaks)
1018 leading_non_space = self.peek()
not in u' \t'
1020 while self.peek(length)
not in u'\0\r\n\x85\u2028\u2029':
1022 chunks.append(self.prefix(length))
1023 self.forward(length)
1024 line_break = self.scan_line_break()
1026 if self.
column == indent
and self.peek() !=
u'\0':
1032 if folded
and line_break ==
u'\n' \
1033 and leading_non_space
and self.peek()
not in u' \t':
1037 chunks.append(line_break)
1054 if chomping
is not False:
1055 chunks.append(line_break)
1056 if chomping
is True:
1057 chunks.extend(breaks)
1075 if ch
in u'0123456789':
1078 raise ScannerError(
"while scanning a block scalar", start_mark,
1079 "expected indentation indicator in the range 1-9, but found 0",
1082 elif ch
in u'0123456789':
1085 raise ScannerError(
"while scanning a block scalar", start_mark,
1086 "expected indentation indicator in the range 1-9, but found 0",
1097 if ch
not in u'\0 \r\n\x85\u2028\u2029':
1098 raise ScannerError(
"while scanning a block scalar", start_mark,
1099 "expected chomping or indentation indicators, but found %r"
1100 % ch.encode(
'utf-8'), self.get_mark())
1101 return chomping, increment
1105 while self.peek() ==
u' ':
1107 if self.peek() ==
u'#':
1108 while self.peek()
not in u'\0\r\n\x85\u2028\u2029':
1111 if ch
not in u'\0\r\n\x85\u2028\u2029':
1112 raise ScannerError(
"while scanning a block scalar", start_mark,
1113 "expected a comment or a line break, but found %r"
1114 % ch.encode(
'utf-8'), self.get_mark())
1115 self.scan_line_break()
1121 end_mark = self.get_mark()
1122 while self.peek()
in u' \r\n\x85\u2028\u2029':
1123 if self.peek() !=
u' ':
1124 chunks.append(self.scan_line_break())
1125 end_mark = self.get_mark()
1128 if self.
column > max_indent:
1130 return chunks, max_indent, end_mark
1135 end_mark = self.get_mark()
1136 while self.
column < indent
and self.peek() ==
u' ':
1138 while self.peek()
in u'\r\n\x85\u2028\u2029':
1139 chunks.append(self.scan_line_break())
1140 end_mark = self.get_mark()
1141 while self.
column < indent
and self.peek() ==
u' ':
1143 return chunks, end_mark
1157 start_mark = self.get_mark()
1160 chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
1161 while self.peek() != quote:
1162 chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
1163 chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
1165 end_mark = self.get_mark()
1169 ESCAPE_REPLACEMENTS = {
1195 def scan_flow_scalar_non_spaces(self, double, start_mark):
1200 while self.peek(length)
not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
1203 chunks.append(self.prefix(length))
1204 self.forward(length)
1206 if not double
and ch ==
u'\'' and self.peek(1) ==
u'\'':
1207 chunks.append(
u'\'')
1209 elif (double
and ch ==
u'\'')
or (
not double
and ch
in u'\"\\'):
1212 elif double
and ch ==
u'\\':
1215 if ch
in self.ESCAPE_REPLACEMENTS:
1216 chunks.append(self.ESCAPE_REPLACEMENTS[ch])
1218 elif ch
in self.ESCAPE_CODES:
1219 length = self.ESCAPE_CODES[ch]
1221 for k
in range(length):
1222 if self.peek(k)
not in u'0123456789ABCDEFabcdef':
1223 raise ScannerError(
"while scanning a double-quoted scalar", start_mark,
1224 "expected escape sequence of %d hexdecimal numbers, but found %r" %
1225 (length, self.peek(k).encode(
'utf-8')), self.get_mark())
1226 code =
int(self.prefix(length), 16)
1227 chunks.append(unichr(code))
1228 self.forward(length)
1229 elif ch
in u'\r\n\x85\u2028\u2029':
1230 self.scan_line_break()
1231 chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
1233 raise ScannerError(
"while scanning a double-quoted scalar", start_mark,
1234 "found unknown escape character %r" % ch.encode(
'utf-8'), self.get_mark())
1238 def scan_flow_scalar_spaces(self, double, start_mark):
1242 while self.peek(length)
in u' \t':
1244 whitespaces = self.prefix(length)
1245 self.forward(length)
1248 raise ScannerError(
"while scanning a quoted scalar", start_mark,
1249 "found unexpected end of stream", self.get_mark())
1250 elif ch
in u'\r\n\x85\u2028\u2029':
1251 line_break = self.scan_line_break()
1252 breaks = self.scan_flow_scalar_breaks(double, start_mark)
1253 if line_break !=
u'\n':
1254 chunks.append(line_break)
1257 chunks.extend(breaks)
1259 chunks.append(whitespaces)
1262 def scan_flow_scalar_breaks(self, double, start_mark):
1268 prefix = self.prefix(3)
1269 if (prefix ==
u'---' or prefix ==
u'...') \
1270 and self.peek(3)
in u'\0 \t\r\n\x85\u2028\u2029':
1271 raise ScannerError(
"while scanning a quoted scalar", start_mark,
1272 "found unexpected document separator", self.get_mark())
1273 while self.peek()
in u' \t':
1275 if self.peek()
in u'\r\n\x85\u2028\u2029':
1276 chunks.append(self.scan_line_break())
1280 def scan_plain(self):
1287 start_mark = self.get_mark()
1288 end_mark = start_mark
1297 if self.peek() ==
u'#':
1300 ch = self.peek(length)
1301 if ch
in u'\0 \t\r\n\x85\u2028\u2029' \
1303 self.peek(length+1)
in u'\0 \t\r\n\x85\u2028\u2029') \
1309 and self.peek(length+1)
not in u'\0 \t\r\n\x85\u2028\u2029,[]{}'):
1310 self.forward(length)
1311 raise ScannerError(
"while scanning a plain scalar", start_mark,
1312 "found unexpected ':'", self.get_mark(),
1313 "Please check http://pyyaml.org/wiki/YAMLColonInFlowContext for details.")
1317 chunks.extend(spaces)
1318 chunks.append(self.prefix(length))
1319 self.forward(length)
1320 end_mark = self.get_mark()
1321 spaces = self.scan_plain_spaces(indent, start_mark)
1322 if not spaces
or self.peek() ==
u'#' \
1327 def scan_plain_spaces(self, indent, start_mark):
1333 while self.peek(length)
in u' ':
1335 whitespaces = self.prefix(length)
1336 self.forward(length)
1338 if ch
in u'\r\n\x85\u2028\u2029':
1339 line_break = self.scan_line_break()
1341 prefix = self.prefix(3)
1342 if (prefix ==
u'---' or prefix ==
u'...') \
1343 and self.peek(3)
in u'\0 \t\r\n\x85\u2028\u2029':
1346 while self.peek()
in u' \r\n\x85\u2028\u2029':
1347 if self.peek() ==
' ':
1350 breaks.append(self.scan_line_break())
1351 prefix = self.prefix(3)
1352 if (prefix ==
u'---' or prefix ==
u'...') \
1353 and self.peek(3)
in u'\0 \t\r\n\x85\u2028\u2029':
1355 if line_break !=
u'\n':
1356 chunks.append(line_break)
1359 chunks.extend(breaks)
1361 chunks.append(whitespaces)
1364 def scan_tag_handle(self, name, start_mark):
1370 raise ScannerError(
"while scanning a %s" % name, start_mark,
1371 "expected '!', but found %r" % ch.encode(
'utf-8'),
1374 ch = self.peek(length)
1376 while u'0' <= ch <=
u'9' or u'A' <= ch <=
'Z' or u'a' <= ch <=
'z' \
1379 ch = self.peek(length)
1381 self.forward(length)
1382 raise ScannerError(
"while scanning a %s" % name, start_mark,
1383 "expected '!', but found %r" % ch.encode(
'utf-8'),
1386 value = self.prefix(length)
1387 self.forward(length)
1390 def scan_tag_uri(self, name, start_mark):
1395 ch = self.peek(length)
1396 while u'0' <= ch <=
u'9' or u'A' <= ch <=
'Z' or u'a' <= ch <=
'z' \
1397 or ch
in u'-;/?:@&=+$,_.!~*\'()[]%':
1399 chunks.append(self.prefix(length))
1400 self.forward(length)
1402 chunks.append(self.scan_uri_escapes(name, start_mark))
1405 ch = self.peek(length)
1407 chunks.append(self.prefix(length))
1408 self.forward(length)
1411 raise ScannerError(
"while parsing a %s" % name, start_mark,
1412 "expected URI, but found %r" % ch.encode(
'utf-8'),
1414 return u''.
join(chunks)
1416 def scan_uri_escapes(self, name, start_mark):
1419 mark = self.get_mark()
1420 while self.peek() ==
u'%':
1423 if self.peek(k)
not in u'0123456789ABCDEFabcdef':
1424 raise ScannerError(
"while scanning a %s" % name, start_mark,
1425 "expected URI escape sequence of 2 hexdecimal numbers, but found %r" %
1426 (self.peek(k).encode(
'utf-8')), self.get_mark())
1427 bytes.append(chr(
int(self.prefix(2), 16)))
1431 except UnicodeDecodeError, exc:
1432 raise ScannerError(
"while scanning a %s" % name, start_mark, str(exc), mark)
1435 def scan_line_break(self):
1445 if ch
in u'\r\n\x85':
1446 if self.prefix(2) ==
u'\r\n':
1451 elif ch
in u'\u2028\u2029':
def scan_block_scalar_ignored_line
def scan_yaml_directive_number
def scan_tag_directive_handle
def next_possible_simple_key
def fetch_document_indicator
def fetch_flow_mapping_end
def scan_block_scalar_breaks
Fstring::size_type len(Fstring const &s)
Length.
def remove_possible_simple_key
def save_possible_simple_key
def fetch_flow_sequence_start
def scan_tag_directive_prefix
def fetch_flow_sequence_end
def scan_block_scalar_indicators
def stale_possible_simple_keys
def scan_tag_directive_value
def scan_yaml_directive_value
def scan_block_scalar_indentation
def scan_directive_ignored_line
def fetch_flow_collection_end
def fetch_flow_mapping_start
indent
In flow context, tokens should respect indentation.
def fetch_flow_collection_start