Skip to content

Commit 448b452

Browse files
committed
Added @guywithface in AUTHORS + doc for mutf8
1 parent 7b7b93b commit 448b452

File tree

2 files changed

+41
-18
lines changed

2 files changed

+41
-18
lines changed

AUTHORS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,4 @@ Many thanks to the contributors:
99
* @voetsjoeba
1010
* Vadim Markovtsev (@vmarkovtsev)
1111
* Jason Spencer, Google LLC (@j8spencer)
12+
* @guywithface

modifiedutf8.py

Lines changed: 40 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,20 @@
1-
# Migrated from
2-
# https://github.com/swstephe/py2jdbc/blob/master/py2jdbc/mutf8.py
1+
#!/usr/bin/python
2+
# -- Content-Encoding: utf-8 --
3+
"""
4+
Implements the support of the Java-specific kind of UTF-8 encoding.
5+
6+
This module is a modified version of ``py2jdbc.mutf8`` provided by
7+
`@guywithface <https://github.com/guywithface>`_.
8+
9+
The project the original file comes from is available at:
10+
https://github.com/swstephe/py2jdbc/
11+
12+
:authors: Scott Stephens (@swstephe), @guywithface
13+
"""
14+
15+
16+
NAME = "mutf8" # not cesu-8, which uses a different zero-byte
17+
318

419
class DecodeMap(object):
520
"""
@@ -10,8 +25,9 @@ class DecodeMap(object):
1025
def __init__(self, count, mask, value, bits):
1126
"""
1227
Initialize a DecodeMap, entry from a static dictionary for the module.
13-
It automatically calculates the mask for the bits for the value, (always
14-
assumed to be at the bottom of the byte).
28+
It automatically calculates the mask for the bits for the value
29+
(always assumed to be at the bottom of the byte).
30+
1531
:param count: The number of bytes in this entire sequence.
1632
:param mask: The mask to apply to the byte at this position.
1733
:param value: The value of masked bits, (without shifting).
@@ -25,15 +41,16 @@ def __init__(self, count, mask, value, bits):
2541

2642
def apply(self, byte, value, data, i, count):
2743
"""
28-
Apply mask, compare to expected value, shift and return
29-
result. Eventually, this could become a `reduce` function.
44+
Apply mask, compare to expected value, shift and return result.
45+
Eventually, this could become a ``reduce`` function.
46+
3047
:param byte: The byte to compare
3148
:param value: The currently accumulated value.
3249
:param data: The data buffer, (array of bytes).
3350
:param i: The position within the data buffer.
3451
:param count: The position of this comparison.
3552
:return: A new value with the bits merged in.
36-
:raises: UnicodeDecodeError if maked bits don't match.
53+
:raises UnicodeDecodeError: if masked bits don't match.
3754
"""
3855
if byte & self.mask == self.value:
3956
value <<= self.bits
@@ -70,23 +87,25 @@ def __repr__(self):
7087
(0xc0, 0x80, 6),
7188
)
7289
}
90+
7391
DECODE_MAP = dict(
74-
(k, tuple(
75-
DecodeMap(k, *vv) for vv in v)
76-
)
92+
(k, tuple(DecodeMap(k, *vv) for vv in v))
7793
for k, v in DECODER_MAP.items()
7894
)
7995

8096

8197
def decoder(data):
8298
"""
83-
This generator processes a sequence of bytes in Modified UTF-8 encoding and produces
84-
a sequence of unicode string characters. It takes bits from the byte until it matches
85-
one of the known encoding serquences.
86-
It uses `DecodeMap` to mask, compare and generate values.
99+
This generator processes a sequence of bytes in Modified UTF-8 encoding
100+
and produces a sequence of unicode string characters.
101+
102+
It takes bits from the byte until it matches one of the known encoding
103+
sequences.
104+
It uses ``DecodeMap`` to mask, compare and generate values.
105+
87106
:param data: a string of bytes in Modified UTF-8 encoding.
88107
:return: a generator producing a string of unicode characters
89-
:raises: `UnicodeDecodeError` if unrecognized byte in sequence is encountered.
108+
:raises UnicodeDecodeError: unrecognised byte in sequence encountered.
90109
"""
91110
def next_byte(_it, start, count):
92111
try:
@@ -140,12 +159,14 @@ def next_byte(_it, start, count):
140159

141160
def decode_modified_utf8(data, errors='strict'):
142161
"""
143-
Decodes a sequence of bytes to a unicode text and length using Modified UTF-8.
144-
This function is designed to be used with Python `codecs` module.
162+
Decodes a sequence of bytes to a unicode text and length using
163+
Modified UTF-8.
164+
This function is designed to be used with Python ``codecs`` module.
165+
145166
:param data: a string of bytes in Modified UTF-8
146167
:param errors: handle decoding errors
147168
:return: unicode text and length
148-
:raises: `UnicodeDecodeError` if sequence is invalid.
169+
:raises UnicodeDecodeError: sequence is invalid.
149170
"""
150171
value, length = u'', 0
151172
it = iter(decoder(data))
@@ -165,5 +186,6 @@ def decode_modified_utf8(data, errors='strict'):
165186
length += 1
166187
return value, length
167188

189+
168190
def mutf8_unichr(value):
169191
return chr(value)

0 commit comments

Comments
 (0)