1- # Migrated from
2- # https://github.com/swstephe/py2jdbc/blob/master/py2jdbc/mutf8.py
1+ #!/usr/bin/python
2+ # -- Content-Encoding: utf-8 --
3+ """
4+ Implements the support of the Java-specific kind of UTF-8 encoding.
5+
6+ This module is a modified version of ``py2jdbc.mutf8`` provided by
7+ `@guywithface <https://github.com/guywithface>`_.
8+
9+ The project the original file comes from is available at:
10+ https://github.com/swstephe/py2jdbc/
11+
12+ :authors: Scott Stephens (@swstephe), @guywithface
13+ """
14+
15+
16+ NAME = "mutf8" # not cesu-8, which uses a different zero-byte
17+
318
419class DecodeMap (object ):
520 """
@@ -10,8 +25,9 @@ class DecodeMap(object):
1025 def __init__ (self , count , mask , value , bits ):
1126 """
1227 Initialize a DecodeMap, entry from a static dictionary for the module.
13- It automatically calculates the mask for the bits for the value, (always
14- assumed to be at the bottom of the byte).
28+ It automatically calculates the mask for the bits for the value
29+ (always assumed to be at the bottom of the byte).
30+
1531 :param count: The number of bytes in this entire sequence.
1632 :param mask: The mask to apply to the byte at this position.
1733 :param value: The value of masked bits, (without shifting).
@@ -25,15 +41,16 @@ def __init__(self, count, mask, value, bits):
2541
2642 def apply (self , byte , value , data , i , count ):
2743 """
28- Apply mask, compare to expected value, shift and return
29- result. Eventually, this could become a `reduce` function.
44+ Apply mask, compare to expected value, shift and return result.
45+ Eventually, this could become a ``reduce`` function.
46+
3047 :param byte: The byte to compare
3148 :param value: The currently accumulated value.
3249 :param data: The data buffer, (array of bytes).
3350 :param i: The position within the data buffer.
3451 :param count: The position of this comparison.
3552 :return: A new value with the bits merged in.
36- :raises: UnicodeDecodeError if maked bits don't match.
53+ :raises UnicodeDecodeError: if masked bits don't match.
3754 """
3855 if byte & self .mask == self .value :
3956 value <<= self .bits
@@ -70,23 +87,25 @@ def __repr__(self):
7087 (0xc0 , 0x80 , 6 ),
7188 )
7289}
90+
# For every entry of DECODER_MAP, build the tuple of DecodeMap objects
# described by its specs; the key is passed to DecodeMap as ``count``.
DECODE_MAP = {
    count: tuple(DecodeMap(count, *spec) for spec in specs)
    for count, specs in DECODER_MAP.items()
}
7995
8096
8197def decoder (data ):
8298 """
83- This generator processes a sequence of bytes in Modified UTF-8 encoding and produces
84- a sequence of unicode string characters. It takes bits from the byte until it matches
85- one of the known encoding serquences.
86- It uses `DecodeMap` to mask, compare and generate values.
99+ This generator processes a sequence of bytes in Modified UTF-8 encoding
100+ and produces a sequence of unicode string characters.
101+
102+ It takes bits from the byte until it matches one of the known encoding
103+ sequences.
104+ It uses ``DecodeMap`` to mask, compare and generate values.
105+
87106 :param data: a string of bytes in Modified UTF-8 encoding.
88107 :return: a generator producing a string of unicode characters
89- :raises: ` UnicodeDecodeError` if unrecognized byte in sequence is encountered.
108+ :raises UnicodeDecodeError: unrecognised byte in sequence encountered.
90109 """
91110 def next_byte (_it , start , count ):
92111 try :
@@ -140,12 +159,14 @@ def next_byte(_it, start, count):
140159
141160def decode_modified_utf8 (data , errors = 'strict' ):
142161 """
143- Decodes a sequence of bytes to a unicode text and length using Modified UTF-8.
144- This function is designed to be used with Python `codecs` module.
162+ Decodes a sequence of bytes to a unicode text and length using
163+ Modified UTF-8.
164+ This function is designed to be used with Python ``codecs`` module.
165+
145166 :param data: a string of bytes in Modified UTF-8
146167 :param errors: handle decoding errors
147168 :return: unicode text and length
148- :raises: ` UnicodeDecodeError` if sequence is invalid.
169+ :raises UnicodeDecodeError: sequence is invalid.
149170 """
150171 value , length = u'' , 0
151172 it = iter (decoder (data ))
@@ -165,5 +186,6 @@ def decode_modified_utf8(data, errors='strict'):
165186 length += 1
166187 return value , length
167188
189+
def mutf8_unichr(value):
    """
    Converts an integer code point to its one-character string.

    :param value: the integer code point to convert
    :return: the result of ``chr(value)``
    """
    character = chr(value)
    return character
0 commit comments