Rosetta
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
reader.py
Go to the documentation of this file.
1 # This module contains abstractions for the input stream. You don't have to
2 # look further; there is no pretty code here.
3 #
4 # We define two classes here.
5 #
6 # Mark(source, line, column)
7 # It's just a record and its only use is producing nice error messages.
8 # Parser does not use it for any other purposes.
9 #
10 # Reader(source, data)
11 # Reader determines the encoding of `data` and converts it to unicode.
12 # Reader provides the following methods and attributes:
13 # reader.peek(index=0) - return the character `index` positions past the current one.
14 # reader.forward(length=1) - move the current position forward by `length` characters.
15 # reader.index - the number of the current character.
16 # reader.line, reader.column - the line and the column of the current character.
17 # (c) Copyright Rosetta Commons Member Institutions.
18 # (c) This file is part of the Rosetta software suite and is made available under license.
19 # (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
20 # (c) For more information, see http://www.rosettacommons.org. Questions about this can be
21 # (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.
22 
23 __all__ = ['Reader', 'ReaderError']
24 
25 from error import YAMLError, Mark
26 
27 import codecs, re
28 
29 # Unfortunately, the codec functions in Python 2.3 do not support the `finish`
30 # argument, so we have to write our own wrappers.
31 
try:
    # Probe whether the built-in codec functions accept the third (`finish`)
    # argument.  If they do, use them directly.
    codecs.utf_8_decode('', 'strict', False)
    from codecs import utf_8_decode, utf_16_le_decode, utf_16_be_decode

except TypeError:

    # Fallback wrappers.  Each one takes the same (data, errors, finish)
    # arguments and returns whatever the underlying two-argument codec
    # function returns for the (possibly shortened) data.  When `finish` is
    # false, a trailing incomplete character is cut off before decoding so
    # that the caller can retry it once more input has arrived.

    def utf_16_le_decode(data, errors, finish=False):
        # An odd number of bytes means the last code unit is incomplete.
        if not finish and len(data) % 2 == 1:
            data = data[:-1]
        return codecs.utf_16_le_decode(data, errors)

    def utf_16_be_decode(data, errors, finish=False):
        # An odd number of bytes means the last code unit is incomplete.
        if not finish and len(data) % 2 == 1:
            data = data[:-1]
        return codecs.utf_16_be_decode(data, errors)

    def utf_8_decode(data, errors, finish=False):
        if not finish:
            # We are trying to remove a possible incomplete multibyte character
            # from the suffix of the data.
            # The first byte of a multi-byte sequence is in the range 0xc0 to 0xfd.
            # All further bytes are in the range 0x80 to 0xbf.
            # UTF-8 encoded UCS characters may be up to six bytes long.
            #
            # `count` is the number of trailing continuation bytes seen so
            # far.  It must grow while we walk backwards from the end;
            # decrementing it made the scan run forward from the start of
            # the data instead.
            size = len(data)
            count = 0
            # One-byte slices are passed through ord() so the comparison
            # does not depend on what indexing a byte string yields.
            while count < 5 and count < size \
                    and 0x80 <= ord(data[size-count-1:size-count]) <= 0xBF:
                count += 1
            # If the byte before the continuation run is a lead byte, drop
            # it together with the run.  This also drops a trailing
            # character that happens to be complete; it is simply left
            # undecoded for the next call.
            if count < 5 and count < size \
                    and 0xC0 <= ord(data[size-count-1:size-count]) <= 0xFD:
                data = data[:size-count-1]
        return codecs.utf_8_decode(data, errors)
63 
class ReaderError(YAMLError):
    """Error describing a byte or character that the reader refuses.

    Stores the source name, the position, the offending character, the
    encoding and the reason, and renders them in `__str__`.
    """

    def __init__(self, name, position, character, encoding, reason):
        self.name, self.position = name, position
        self.character = character
        self.encoding, self.reason = encoding, reason

    def __str__(self):
        code = ord(self.character)
        location = (self.name, self.position)
        if isinstance(self.character, str):
            # A `str` character is reported as an undecodable byte.
            template = ("'%s' codec can't decode byte #x%02x: %s\n"
                        " in \"%s\", position %d")
            return template % ((self.encoding, code, self.reason) + location)
        # Anything else is reported as an unacceptable character.
        template = ("unacceptable character #x%04x: %s\n"
                    " in \"%s\", position %d")
        return template % ((code, self.reason) + location)
84 
class Reader(object):
    # Reader:
    # - determines the data encoding and converts it to unicode,
    # - checks if characters are in allowed range,
    # - adds '\0' to the end.

    # Reader accepts
    #  - a `str` object,
    #  - a `unicode` object,
    #  - a file-like object with its `read` method returning `str`,
    #  - a file-like object with its `read` method returning `unicode`.

    # Yeah, it's ugly and slow.

    def __init__(self, stream):
        """Set up the reader over `stream` (see the accepted kinds above)."""
        self.name = None
        self.stream = None
        self.stream_pointer = 0     # number of raw items read from the stream
        self.eof = True
        self.buffer = u''           # decoded, checked characters
        self.pointer = 0            # position of the current character in buffer
        self.raw_buffer = None      # data read but not yet decoded
        self.raw_decode = None      # decoder for raw_buffer, if it needs one
        self.encoding = None
        self.index = 0
        self.line = 0
        self.column = 0
        if isinstance(stream, unicode):
            # Already decoded: validate once and terminate with '\0'.
            self.name = "<unicode string>"
            self.check_printable(stream)
            self.buffer = stream+u'\0'
        elif isinstance(stream, str):
            self.name = "<string>"
            self.raw_buffer = stream
            self.determine_encoding()
        else:
            self.stream = stream
            self.name = getattr(stream, 'name', "<file>")
            self.eof = False
            self.raw_buffer = ''
            self.determine_encoding()

    def peek(self, index=0):
        """Return the character `index` positions past the current one."""
        try:
            return self.buffer[self.pointer+index]
        except IndexError:
            # Not buffered yet: pull in more data and retry.
            self.update(index+1)
            return self.buffer[self.pointer+index]

    def prefix(self, length=1):
        """Return up to `length` characters starting at the current one."""
        if self.pointer+length >= len(self.buffer):
            self.update(length)
        return self.buffer[self.pointer:self.pointer+length]

    def forward(self, length=1):
        """Advance by `length` characters, updating index, line and column."""
        # One extra character is requested so that the look-ahead after a
        # '\r' below has something to index.
        if self.pointer+length+1 >= len(self.buffer):
            self.update(length+1)
        while length:
            ch = self.buffer[self.pointer]
            self.pointer += 1
            self.index += 1
            # A '\r' directly followed by '\n' is not counted here; the
            # '\n' that follows starts the new line.
            if ch in u'\n\x85\u2028\u2029'  \
                    or (ch == u'\r' and self.buffer[self.pointer] != u'\n'):
                self.line += 1
                self.column = 0
            elif ch != u'\uFEFF':
                # A byte order mark does not advance the column.
                self.column += 1
            length -= 1

    def get_mark(self):
        """Return a Mark for the current position.

        The buffer and pointer are included only when the reader is not
        backed by a stream.
        """
        if self.stream is None:
            return Mark(self.name, self.index, self.line, self.column,
                    self.buffer, self.pointer)
        else:
            return Mark(self.name, self.index, self.line, self.column,
                    None, None)

    def determine_encoding(self):
        """Pick the decoder for the raw data and fill the buffer.

        A UTF-16 byte order mark selects the matching UTF-16 decoder; any
        other `str` data is treated as UTF-8.  Raw data that is already
        `unicode` gets no decoder.
        """
        # Two raw items are enough to recognise a UTF-16 BOM.
        while not self.eof and len(self.raw_buffer) < 2:
            self.update_raw()
        if not isinstance(self.raw_buffer, unicode):
            if self.raw_buffer.startswith(codecs.BOM_UTF16_LE):
                self.raw_decode = utf_16_le_decode
                self.encoding = 'utf-16-le'
            elif self.raw_buffer.startswith(codecs.BOM_UTF16_BE):
                self.raw_decode = utf_16_be_decode
                self.encoding = 'utf-16-be'
            else:
                self.raw_decode = utf_8_decode
                self.encoding = 'utf-8'
        self.update(1)

    NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
    def check_printable(self, data):
        """Raise ReaderError if `data` has a character outside the allowed set."""
        match = self.NON_PRINTABLE.search(data)
        if match:
            character = match.group()
            # `data` has not been appended to the buffer yet, so its start
            # lies just past the characters still buffered.
            position = self.index+(len(self.buffer)-self.pointer)+match.start()
            raise ReaderError(self.name, position, character,
                    'unicode', "special characters are not allowed")

    def update(self, length):
        """Try to make at least `length` characters available in the buffer.

        Consumed characters are dropped from the buffer first.  Does nothing
        once the raw data is exhausted.  Raises ReaderError when the raw
        data cannot be decoded or holds a disallowed character.
        """
        # sys.exc_info() is used below so that the handler is spelled the
        # same way on every Python version.
        import sys
        if self.raw_buffer is None:
            return
        self.buffer = self.buffer[self.pointer:]
        self.pointer = 0
        while len(self.buffer) < length:
            if not self.eof:
                self.update_raw()
            if self.raw_decode is not None:
                try:
                    data, converted = self.raw_decode(self.raw_buffer,
                            'strict', self.eof)
                except UnicodeDecodeError:
                    exc = sys.exc_info()[1]
                    character = exc.object[exc.start]
                    if self.stream is not None:
                        position = self.stream_pointer-len(self.raw_buffer)+exc.start
                    else:
                        position = exc.start
                    raise ReaderError(self.name, position, character,
                            exc.encoding, exc.reason)
            else:
                data = self.raw_buffer
                converted = len(data)
            self.check_printable(data)
            self.buffer += data
            # Keep whatever the decoder left unconverted for the next round.
            self.raw_buffer = self.raw_buffer[converted:]
            if self.eof:
                self.buffer += u'\0'
                self.raw_buffer = None
                break

    def update_raw(self, size=1024):
        """Read up to `size` more items from the stream; flag EOF if none."""
        data = self.stream.read(size)
        if data:
            self.raw_buffer += data
            self.stream_pointer += len(data)
        else:
            self.eof = True
224 
225 #try:
226 # import psyco
227 # psyco.bind(Reader)
228 #except ImportError:
229 # pass
230 
def utf_16_le_decode
Definition: reader.py:38
def utf_8_decode
Definition: reader.py:48
Fstring::size_type len(Fstring const &s)
Length.
Definition: Fstring.hh:2207
def determine_encoding
Definition: reader.py:162
def utf_16_be_decode
Definition: reader.py:43