Skip to content
This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 5ca66de

Browse files
T2T Team and copybara-github
authored and committed
Internal
PiperOrigin-RevId: 392122923
1 parent 7ae6d28 commit 5ca66de

File tree

2 files changed

+14
-2
lines changed

2 files changed

+14
-2
lines changed

tensor2tensor/data_generators/text_problems.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -835,7 +835,11 @@ def text2text_txt_tab_iterator(txt_path):
835835
Yields:
836836
{"inputs": inputs, "targets": targets}
837837
"""
838-
for line in txt_line_iterator(txt_path):
838+
if txt_path.endswith(".tsv*"):
839+
data_iterator = txt_line_sharded_iterator(txt_path)
840+
else:
841+
data_iterator = txt_line_iterator(txt_path)
842+
for line in data_iterator:
839843
if line and "\t" in line:
840844
parts = line.split("\t", 1)
841845
inputs, targets = parts[:2]

tensor2tensor/data_generators/translate.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -344,5 +344,13 @@ def generate_text_for_vocab(self, data_dir, tmp_dir):
344344

345345
def generate_samples(self, data_dir, tmp_dir, dataset_split):
346346
data_path = self.source_data_files(dataset_split)[0]
347-
assert tf.gfile.Exists(data_path)
348347
return text_problems.text2text_txt_tab_iterator(data_path)
348+
349+
350+
class TranslateSamanantarProblem(TranslateWmt20Problem):
351+
"""Base class for Samanantar Datasets."""
352+
353+
def generate_samples(self, data_dir, tmp_dir, dataset_split):
354+
src_data_path = self.source_data_files(dataset_split)[0]
355+
tgt_data_path = self.source_data_files(dataset_split)[1]
356+
return text_problems.text2text_txt_iterator(src_data_path, tgt_data_path)

0 commit comments

Comments
 (0)