3 from __future__ import print_function
5 #__all__ = ['EncDec', 'EncDecSimple', 'EncDecTyped', 'EncDecA',
6 # 'SequenceError', 'Sequencer']
13 '1': struct.Struct('<B'),
14 '2': struct.Struct('<H'),
15 '4': struct.Struct('<I'),
16 '8': struct.Struct('<Q'),
17 '_string_': None, # handled specially
# Make each integer width (1, 2, 4, 8) an alias for the entry stored
# under its string spelling, so either form can be used as a format key.
for _i in (1, 2, 4, 8):
    _ProtoStruct[_i] = _ProtoStruct['{0:d}'.format(_i)]
24 __metaclass__ = abc.ABCMeta
26 Base class for en/de-coders, which are put into sequencers.
28 All have a name and arbitrary user-supplied auxiliary data
31 All provide a pack() and unpack(). The pack() function
32 returns a "bytes" value. This is internally implemented as a
33 function apack() that returns a list of struct.pack() bytes,
34 and pack() just joins them up as needed.
36 The pack/unpack functions take a dictionary of variable names
37 and values, and a second dictionary for conditionals, but at
38 this level conditionals don't apply: they are just being
39 passed through. Variable names do apply to array encoders
41 EncDec also provides b2s() and s2b() static methods, which
42 convert strings to bytes and vice versa, as reversibly as
43 possible (using surrogateescape encoding). In Python2 this is
44 a no-op since the string type *is* the bytes type (<type
45 'unicode'> is the unicode-ized string type).
47 EncDec also provides b2u() and u2b() to do conversion to/from
50 These are partly for internal use (all strings get converted
51 to UTF-8 byte sequences when coding a _string_ type) and partly
52 for doctests, where we just want some py2k/py3k compat hacks.
54 def __init__(self, name, aux):
def b2u(byte_sequence):
    "decode UTF-8 bytes to unicode, using the surrogateescape error handler"
    text = byte_sequence.decode(encoding='utf-8', errors='surrogateescape')
    return text
def u2b(unicode_sequence):
    "encode unicode to UTF-8 bytes, using the surrogateescape error handler"
    raw = unicode_sequence.encode(encoding='utf-8', errors='surrogateescape')
    return raw
68 if sys.version_info[0] >= 3:
72 "transform string to bytes (leaves raw byte sequence unchanged)"
73 if isinstance(string, bytes):
75 return string.encode('utf-8', 'surrogateescape')
78 def b2s(byte_sequence):
79 "transform bytes to string - no-op in python2.7"
83 "transform string or unicode to bytes"
84 if isinstance(string, unicode):
85 return string.encode('utf-8', 'surrogateescape')
def pack(self, vdict, cdict, val):
    "encode value <val> into a single byte-string by joining the apack() parts"
    pieces = self.apack(vdict, cdict, val)
    return b''.join(pieces)
def apack(self, vdict, cdict, val):
    "encode value <val> as a list of byte-strings [bytes1, b2, ..., bN]"
def unpack(self, vdict, cdict, bstring, offset, noerror=False):
    "decode bytes taken from <bstring>, starting at <offset>"
101 class EncDecSimple(EncDec):
103 Encode/decode a simple (but named) field. The field is not an
104 array, which requires using EncDecA, nor a typed object
105 like a qid or stat instance -- those require a Sequence and
108 The format is one of '1'/1, '2'/2, '4'/4, '8'/8, or '_string_'.
110 Note: using b2s here is purely a doctest/testmod python2/python3
111 compat hack. The output of e.pack is <type 'bytes'>; b2s
112 converts it to a string, purely for display purposes. (It might
113 be better to map py2 output to bytes but they just print as a
114 string anyway.) In normal use, you should not call b2s here.
116 >>> e = EncDecSimple('eggs', 2)
117 >>> e.b2s(e.pack({}, {}, 0))
119 >>> e.b2s(e.pack({}, {}, 256))
122 Values that cannot be packed produce a SequenceError:
124 >>> e.pack({}, {}, None)
125 Traceback (most recent call last):
127 SequenceError: failed while packing 'eggs'=None
128 >>> e.pack({}, {}, -1)
129 Traceback (most recent call last):
131 SequenceError: failed while packing 'eggs'=-1
133 Unpacking both returns a value, and tells how many bytes it
134 used out of the bytestring or byte-array argument. If there
135 are not enough bytes remaining at the starting offset, it
136 raises a SequenceError, unless noerror=True (then unset
139 >>> e.unpack({}, {}, b'\x00\x01', 0)
141 >>> e.unpack({}, {}, b'', 0)
142 Traceback (most recent call last):
144 SequenceError: out of data while unpacking 'eggs'
145 >>> e.unpack({}, {}, b'', 0, noerror=True)
148 Note that strings can be provided as regular strings, byte
149 strings (same as regular strings in py2k), or Unicode strings
150 (same as regular strings in py3k). Unicode strings will be
151 converted to UTF-8 before being packed. Since this leaves
152 7-bit characters alone, these examples work in both py2k and
153 py3k. (Note: the UTF-8 encoding of u'\u1234' is
154 '\0xe1\0x88\0xb4' or 225, 136, 180. The b2i trick below is
155 another py2k vs py3k special case just for doctests: py2k
156 tries to display the utf-8 encoded data as a string.)
158 >>> e = EncDecSimple('spam', '_string_')
159 >>> e.b2s(e.pack({}, {}, 'p3=unicode,p2=bytes'))
160 '\x13\x00p3=unicode,p2=bytes'
162 >>> e.b2s(e.pack({}, {}, b'bytes'))
166 >>> ispy3k = sys.version_info[0] >= 3
168 >>> b2i = lambda x: x if ispy3k else ord(x)
169 >>> [b2i(x) for x in e.pack({}, {}, u'\u1234')]
170 [3, 0, 225, 136, 180]
172 The byte length of the utf-8 data cannot exceed 65535 since
173 the encoding has the length as a 2-byte field (a la the
174 encoding for 'eggs' here). A too-long string produces
175 a SequenceError as well.
177 >>> e.pack({}, {}, 16384 * 'spam')
178 Traceback (most recent call last):
180 SequenceError: string too long (len=65536) while packing 'spam'
182 Unpacking strings produces byte arrays. (Of course,
183 in py2k these are also known as <type 'str'>.)
185 >>> unpacked = e.unpack({}, {}, b'\x04\x00data', 0)
186 >>> etype = bytes if ispy3k else str
187 >>> print(isinstance(unpacked[0], etype))
189 >>> e.b2s(unpacked[0])
194 You may use e.b2s() to convert them to unicode strings in py3k,
195 or you may set e.autob2s. This still only really does
196 anything in py3k, since py2k strings *are* bytes, so it's
197 really just intended for doctest purposes (see EncDecA):
200 >>> e.unpack({}, {}, b'\x07\x00stringy', 0)
203 def __init__(self, name, fmt, aux=None):
204 super(EncDecSimple, self).__init__(name, aux)
206 self.struct = _ProtoStruct[fmt]
211 return '{0}({1!r}, {2!r})'.format(self.__class__.__name__,
213 return '{0}({1!r}, {2!r}, {3!r})'.format(self.__class__.__name__,
214 self.name, self.fmt, self.aux)
218 def apack(self, vdict, cdict, val):
222 return [self.struct.pack(val)]
224 if len(sval) > 65535:
225 raise SequenceError('string too long (len={0:d}) '
226 'while packing {1!r}'.format(len(sval), self.name))
227 return [EncDecSimple.string_len.pack(len(sval)), sval]
228 # Include AttributeError in case someone tries to, e.g.,
229 # pack name=None and self.s2b() tries to use .encode on it.
230 except (struct.error, AttributeError):
231 raise SequenceError('failed '
232 'while packing {0!r}={1!r}'.format(self.name, val))
234 def _unpack1(self, via, bstring, offset, noerror):
235 "internal function to unpack single item"
237 tup = via.unpack_from(bstring, offset)
238 except struct.error as err:
239 if 'unpack_from requires a buffer of at least' in str(err):
241 return None, offset + via.size
242 raise SequenceError('out of data '
243 'while unpacking {0!r}'.format(self.name))
244 # not clear what to do here if noerror
245 raise SequenceError('failed '
246 'while unpacking {0!r}'.format(self.name))
248 return tup[0], offset + via.size
250 def unpack(self, vdict, cdict, bstring, offset, noerror=False):
251 "decode a value; return the value and the new offset"
253 return self._unpack1(self.struct, bstring, offset, noerror)
254 slen, offset = self._unpack1(EncDecSimple.string_len, bstring, offset,
258 nexto = offset + slen
259 if len(bstring) < nexto:
263 raise SequenceError('out of data '
264 'while unpacking {0!r}'.format(self.name))
266 val = bstring[offset:nexto]
271 # string length: 2 byte unsigned field
272 EncDecSimple.string_len = _ProtoStruct[2]
274 class EncDecTyped(EncDec):
276 EncDec for typed objects (which are built from PFODs, which are
277 a sneaky class variant of OrderedDict similar to namedtuple).
279 Calling the klass() function with no arguments must create an
280 instance with all-None members.
282 We also require a Sequencer to pack and unpack the members of
285 >>> qid_s = Sequencer('qid')
286 >>> qid_s.append_encdec(None, EncDecSimple('type', 1))
287 >>> qid_s.append_encdec(None, EncDecSimple('version', 4))
288 >>> qid_s.append_encdec(None, EncDecSimple('path', 8))
292 >>> from pfod import pfod
293 >>> qid = pfod('qid', ['type', 'version', 'path'])
296 >>> qid_inst = qid(1, 2, 3)
298 qid(type=1, version=2, path=3)
300 >>> e = EncDecTyped(qid, 'aqid', qid_s)
301 >>> e.b2s(e.pack({}, {}, qid_inst))
302 '\x01\x02\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00'
304 ... b'\x01\x02\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00', 0)
305 (qid(type=1, version=2, path=3), 13)
307 If an EncDecTyped instance has a conditional sequencer, note
308 that unpacking will leave un-selected items set to None (see
309 the Sequencer example below):
311 >>> breakfast = pfod('breakfast', 'eggs spam ham')
313 breakfast(eggs=None, spam=None, ham=None)
314 >>> bfseq = Sequencer('breakfast')
315 >>> bfseq.append_encdec(None, EncDecSimple('eggs', 1))
316 >>> bfseq.append_encdec('yuck', EncDecSimple('spam', 1))
317 >>> bfseq.append_encdec(None, EncDecSimple('ham', 1))
318 >>> e = EncDecTyped(breakfast, 'bfname', bfseq)
319 >>> e.unpack({}, {'yuck': False}, b'\x02\x01\x04', 0)
320 (breakfast(eggs=2, spam=None, ham=1), 2)
322 This used just two of the three bytes: eggs=2, ham=1.
324 >>> e.unpack({}, {'yuck': True}, b'\x02\x01\x04', 0)
325 (breakfast(eggs=2, spam=1, ham=4), 3)
327 This used the third byte, so ham=4.
329 def __init__(self, klass, name, sequence, aux=None):
330 assert len(sequence) == len(klass()._fields) # temporary
331 super(EncDecTyped, self).__init__(name, aux)
334 self.sequence = sequence
338 return '{0}({1!r}, {2!r}, {3!r})'.format(self.__class__.__name__,
339 self.klass, self.name, self.sequence)
340 return '{0}({1!r}, {2!r}, {3!r}, {4!r})'.format(self.__class__.__name__,
341 self.klass, self.name, self.sequence, self.aux)
345 def apack(self, vdict, cdict, val):
347 Pack each of our instance variables.
349 Note that some packing may be conditional.
351 return self.sequence.apack(val, cdict)
353 def unpack(self, vdict, cdict, bstring, offset, noerror=False):
355 Unpack each instance variable, into a new object of
356 self.klass. Return the new instance and new offset.
358 Note that some unpacking may be conditional.
361 offset = self.sequence.unpack_from(obj, cdict, bstring, offset, noerror)
364 class EncDecA(EncDec):
366 EncDec for arrays (repeated objects).
368 We take the name of repeat count variable, and a sub-coder
369 (Sequencer instance). For instance, we can en/de-code
370 repeat='nwname' copies of name='wname', or nwname of
371 name='wqid', in a Twalk en/de-code.
373 Note that we don't pack or unpack the repeat count itself --
374 that must be done by higher level code. We just get its value
377 >>> subcode = EncDecSimple('wname', '_string_')
378 >>> e = EncDecA('nwname', 'wname', subcode)
379 >>> e.b2s(e.pack({'nwname': 2}, {}, ['A', 'BC']))
380 '\x01\x00A\x02\x00BC'
382 >>> subcode.autob2s = True # so that A and BC decode to py3k str
383 >>> e.unpack({'nwname': 2}, {}, b'\x01\x00A\x02\x00BC', 0)
386 When using noerror, the first sub-item that fails to decode
387 completely starts the None-s. Strings whose length fails to
388 decode are assumed to be zero bytes long as well, for the
389 purpose of showing the expected packet length:
391 >>> e.unpack({'nwname': 2}, {}, b'\x01\x00A\x02\x00', 0, noerror=True)
393 >>> e.unpack({'nwname': 2}, {}, b'\x01\x00A\x02', 0, noerror=True)
395 >>> e.unpack({'nwname': 3}, {}, b'\x01\x00A\x02', 0, noerror=True)
396 (['A', None, None], 7)
398 As a special case, supplying None for the sub-coder
399 makes the repeated item pack or unpack a simple byte
400 string. (Note that autob2s is not supported here.)
401 A too-short byte string is simply truncated!
403 >>> e = EncDecA('count', 'data', None)
404 >>> e.b2s(e.pack({'count': 5}, {}, b'12345'))
406 >>> x = list(e.unpack({'count': 3}, {}, b'123', 0))
407 >>> x[0] = e.b2s(x[0])
410 >>> x = list(e.unpack({'count': 3}, {}, b'12', 0, noerror=True))
411 >>> x[0] = e.b2s(x[0])
415 def __init__(self, repeat, name, sub, aux=None):
416 super(EncDecA, self).__init__(name, aux)
423 return '{0}({1!r}, {2!r}, {3!r})'.format(self.__class__.__name__,
424 self.repeat, self.name, self.sub)
425 return '{0}({1!r}, {2!r}, {3!r}, {4!r})'.format(self.__class__.__name__,
426 self.repeat, self.name, self.sub, self.aux)
430 def apack(self, vdict, cdict, val):
431 "pack each val[i], for i in range(vdict[self.repeat])"
432 num = vdict[self.repeat]
433 assert num == len(val)
435 assert isinstance(val, bytes)
439 parts.extend(self.sub.apack(vdict, cdict, i))
442 def unpack(self, vdict, cdict, bstring, offset, noerror=False):
443 "unpack repeatedly, per self.repeat, into new array."
444 num = vdict[self.repeat]
445 if num is None and noerror:
451 if len(bstring) < nexto and not noerror:
452 raise SequenceError('out of data '
453 'while unpacking {0!r}'.format(self.name))
454 return bstring[offset:nexto], nexto
457 obj, offset = self.sub.unpack(vdict, cdict, bstring, offset,
class SequenceError(Exception):
    """Sequence error: an item was too big, or the data ran out."""
466 class Sequencer(object):
468 A sequencer is an object that packs (marshals) or unpacks
469 (unmarshals) a series of objects, according to their EncDec
472 The objects themselves (and their values) come from, or
473 go into, a dictionary: <vdict>, the first argument to
476 Some fields may be conditional. The conditions are in a
477 separate dictionary (the second or <cdict> argument).
479 Some objects may be dictionaries or PFODs, e.g., they may
480 be a Plan9 qid or stat structure. These have their own
483 As with each encoder, we have both an apack() function
484 (returns a list of parts) and a plain pack(). Users should
485 mostly stick with plain pack().
487 >>> s = Sequencer('monty')
490 >>> e = EncDecSimple('eggs', 2)
491 >>> s.append_encdec(None, e)
492 >>> s.append_encdec(None, EncDecSimple('spam', 1))
494 (None, EncDecSimple('eggs', 2))
495 >>> e.b2s(s.pack({'eggs': 513, 'spam': 65}, {}))
498 When particular fields are conditional, they appear in
499 packed output, or are taken from the byte-string during
500 unpacking, only if their condition is true.
502 As with struct, use unpack_from to start at an arbitrary
503 offset and/or omit verification that the entire byte-string
506 >>> s = Sequencer('python')
507 >>> s.append_encdec(None, e)
508 >>> s.append_encdec('.u', EncDecSimple('spam', 1))
510 ('.u', EncDecSimple('spam', 1))
511 >>> e.b2s(s.pack({'eggs': 513, 'spam': 65}, {'.u': True}))
513 >>> e.b2s(s.pack({'eggs': 513, 'spam': 65}, {'.u': False}))
517 >>> s.unpack(d, {'.u': True}, b'\x01\x02A')
518 >>> print(d['eggs'], d['spam'])
521 >>> s.unpack(d, {'.u': False}, b'\x01\x02A', 0)
522 Traceback (most recent call last):
524 SequenceError: 1 byte(s) unconsumed
525 >>> s.unpack_from(d, {'.u': False}, b'\x01\x02A', 0)
530 The incoming dictionary-like object may be pre-initialized
531 if you like; only sequences that decode are filled-in:
533 >>> d = {'eggs': None, 'spam': None}
534 >>> s.unpack_from(d, {'.u': False}, b'\x01\x02A', 0)
536 >>> print(d['eggs'], d['spam'])
539 Some objects may be arrays; if so their EncDec is actually
540 an EncDecA, the repeat count must be in the dictionary, and
541 the object itself must have a len() and be index-able:
543 >>> s = Sequencer('arr')
544 >>> s.append_encdec(None, EncDecSimple('n', 1))
545 >>> ae = EncDecSimple('array', 2)
546 >>> s.append_encdec(None, EncDecA('n', 'array', ae))
547 >>> ae.b2s(s.pack({'n': 2, 'array': [257, 514]}, {}))
548 '\x02\x01\x01\x02\x02'
550 Unpacking an array creates a list of the number of items.
551 The EncDec encoder that decodes the number of items needs to
552 occur first in the sequencer, so that the dictionary will have
553 acquired the repeat-count variable's value by the time we hit
557 >>> s.unpack(d, {}, b'\x01\x04\x00')
558 >>> d['n'], d['array']
561 def __init__(self, name):
564 self.debug = False # or sys.stderr
567 return '{0}({1!r})'.format(self.__class__.__name__, self.name)
572 return len(self._codes)
575 return iter(self._codes)
def __getitem__(self, index):
    "index into the list of (cond, code) pairs added via append_encdec"
    entries = self._codes
    return entries[index]
580 def dprint(self, *args, **kwargs):
583 if isinstance(self.debug, bool):
587 print(*args, file=dest, **kwargs)
def append_encdec(self, cond, code):
    "register en/de-coder <code>, applied only when condition <cond> holds"
    entry = (cond, code)
    self._codes.append(entry)
593 def apack(self, vdict, cdict):
595 Produce packed representation of each field.
598 for cond, code in self._codes:
599 # Skip this item if it's conditional on a false thing.
600 if cond is not None and not cdict[cond]:
601 self.dprint('skip %r - %r is False' % (code, cond))
605 self.dprint('pack %r - no cond or %r is True' % (code, cond))
606 packed_data.extend(code.apack(vdict, cdict, vdict[code.name]))
610 def pack(self, vdict, cdict):
614 return b''.join(self.apack(vdict, cdict))
616 def unpack_from(self, vdict, cdict, bstring, offset=0, noerror=False):
618 Unpack from byte string.
620 The values are unpacked into a dictionary vdict;
621 some of its entries may themselves be ordered
622 dictionaries created by typedefed codes.
624 Raises SequenceError if the string is too short,
625 unless you set noerror, in which case we assume
626 you want to see what you can get out of the data.
628 for cond, code in self._codes:
629 # Skip this item if it's conditional on a false thing.
630 if cond is not None and not cdict[cond]:
631 self.dprint('skip %r - %r is False' % (code, cond))
635 self.dprint('unpack %r - no cond or %r is True' % (code, cond))
636 obj, offset = code.unpack(vdict, cdict, bstring, offset, noerror)
637 vdict[code.name] = obj
def unpack(self, vdict, cdict, bstring, noerror=False):
    """
    Like unpack_from but unless noerror=True, requires that
    we completely use up the given byte string.
    """
    consumed = self.unpack_from(vdict, cdict, bstring, 0, noerror)
    if noerror:
        # Best-effort decode: leftover bytes are not an error.
        return
    leftover = len(bstring) - consumed
    if leftover != 0:
        raise SequenceError('{0} byte(s) unconsumed'.format(leftover))
651 if __name__ == '__main__':