Skip to content
This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 0b3ee79

Browse files
T2T Team and copybara-github
authored and committed
Fixes python3 related unicode errors in wiki_revision_utils.
PiperOrigin-RevId: 404870305
1 parent 79b920f commit 0b3ee79

File tree

1 file changed

+9
-2
lines changed

1 file changed

+9
-2
lines changed

tensor2tensor/data_generators/wiki_revision_utils.py

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,10 @@
3333
import tensorflow.compat.v1 as tf
3434

3535

36+
def to_unicode(s):
37+
return s.decode("utf-8")
38+
39+
3640
def include_revision(revision_num, skip_factor=1.1):
3741
"""Decide whether to include a revision.
3842
@@ -76,6 +80,10 @@ def file_page_generator(my_file, max_page_size=2**28):
7680
leftovers = ""
7781
while True:
7882
chunk = my_file.read(chunk_size)
83+
try:
84+
chunk = to_unicode(chunk)
85+
except UnicodeDecodeError:
86+
chunk = ""
7987
if not chunk:
8088
break
8189
chunk = leftovers + chunk
@@ -112,7 +120,7 @@ def get_title(page):
112120
assert start_pos != -1
113121
assert end_pos != -1
114122
start_pos += len("<title>")
115-
return text_encoder.to_unicode_utf8(page[start_pos:end_pos])
123+
return page[start_pos:end_pos]
116124

117125

118126
def get_id(page):
@@ -251,7 +259,6 @@ def get_text(revision, strip=True):
251259
ret = revision[end_tag_pos:end_pos]
252260
if strip:
253261
ret = strip_text(ret)
254-
ret = text_encoder.to_unicode_utf8(ret)
255262
return ret
256263

257264

0 commit comments

Comments
 (0)