This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit ae042f6

T2T Team authored and copybara-github committed
Internal
PiperOrigin-RevId: 380716688
1 parent: 3f12173 · commit: ae042f6

File tree

2 files changed: 62 additions & 0 deletions


tensor2tensor/data_generators/text_problems.py

Lines changed: 27 additions & 0 deletions
@@ -799,6 +799,32 @@ def text2real_txt_iterator(source_txt_path, target_txt_path):
     yield {"inputs": inputs, "targets": targets}
 
 
+def txt_line_sharded_iterator(txt_pattern):
+  """Iterate through the lines of a sharded text file."""
+  all_files = tf.gfile.Glob(txt_pattern)
+  for txt_path in all_files:
+    with tf.gfile.Open(txt_path) as f:
+      for line in f:
+        yield line.strip()
+
+
+def text2text_txt_sharded_iterator(source_txt_pattern, target_txt_pattern):
+  """Yield dicts for Text2TextProblem.generate_samples from lines of files.
+
+  Args:
+    source_txt_pattern: glob pattern matching the sharded source files
+    target_txt_pattern: glob pattern matching the sharded target files
+
+  Yields:
+    {"inputs": inputs, "targets": targets}
+
+  """
+  for inputs, targets in zip(
+      txt_line_sharded_iterator(source_txt_pattern),
+      txt_line_sharded_iterator(target_txt_pattern)):
+    yield {"inputs": inputs, "targets": targets}
+
+
 def text2text_txt_tab_iterator(txt_path):
   """Yield dicts for Text2TextProblem.generate_samples from lines of txt_path.
 
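The two new helpers above read a parallel corpus stored as sharded text files. Below is a minimal usage sketch; the shard patterns and paths are hypothetical, not part of this commit. Note that the pairing relies on zip, so the source and target shards must enumerate the same lines in the same order, and any surplus lines on one side are silently dropped.

# A minimal sketch, assuming hypothetical one-sentence-per-line shard
# files such as /tmp/corpus.src-00000-of-00002 and
# /tmp/corpus.tgt-00000-of-00002 (the iterators use the TF1-style
# tf.gfile API internally, so a TF1 environment is assumed).
from tensor2tensor.data_generators import text_problems

for sample in text_problems.text2text_txt_sharded_iterator(
    "/tmp/corpus.src-*-of-*", "/tmp/corpus.tgt-*-of-*"):
  # Each sample is {"inputs": <source line>, "targets": <target line>}.
  print(sample["inputs"], "=>", sample["targets"])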
@@ -848,6 +874,7 @@ class Text2textTmpdir(Text2TextProblem):
   TRAIN_FILES = ("inputs.train.txt", "targets.train.txt")
   EVAL_FILES = ("inputs.eval.txt", "targets.eval.txt")
 
+  @property
   def is_generate_per_split(self):
     return True
 
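The one-line @property additions here and in translate.py below fix the same subtle issue: the Text2TextProblem base class defines is_generate_per_split as a property, so a subclass that overrides it with a plain method shadows the property, and reading problem.is_generate_per_split yields a bound method rather than a boolean. A bound method is always truthy, which happens to match the intended True here, but the decorator makes the attribute a real boolean again. A standalone sketch of the pitfall (illustrative class names, not t2t code):

class PlainMethod:
  def is_generate_per_split(self):
    return False

class WithProperty:
  @property
  def is_generate_per_split(self):
    return False

# The bound method is truthy even though it would return False if called.
print(bool(PlainMethod().is_generate_per_split))   # True
# The property evaluates to the actual boolean value.
print(bool(WithProperty().is_generate_per_split))  # False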

tensor2tensor/data_generators/translate.py

Lines changed: 35 additions & 0 deletions
@@ -266,6 +266,7 @@ def compile_data(tmp_dir, datasets, filename, datatypes_to_clean=None):
 class TranslateDistillProblem(TranslateProblem):
   """Base class for translation problems."""
 
+  @property
   def is_generate_per_split(self):
     return True
 

@@ -311,3 +312,37 @@ def generate_samples(self, data_dir, tmp_dir, dataset_split):
     return text_problems.text2text_distill_iterator(data_path + "inputs",
                                                     data_path + "gold",
                                                     data_path + "prediction")
+
+
+class TranslateWmt20Problem(TranslateProblem):
+  """Base class for WMT20 datasets."""
+
+  @property
+  def is_generate_per_split(self):
+    return True
+
+  def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
+    generator = self.generate_samples(data_dir, tmp_dir, dataset_split)
+    vocab = self.get_or_create_vocab(data_dir, tmp_dir)
+    # For each example, encode the text and append EOS_ID.
+    for sample in generator:
+      if self.has_inputs:
+        sample["inputs"] = vocab.encode(sample["inputs"])
+        sample["inputs"].append(text_encoder.EOS_ID)
+      sample["targets"] = vocab.encode(sample["targets"])
+      sample["targets"].append(text_encoder.EOS_ID)
+      yield sample
+
+  def generate_text_for_vocab(self, data_dir, tmp_dir):
+    for i, sample in enumerate(
+        self.generate_samples(data_dir, tmp_dir, problem.DatasetSplit.TRAIN)):
+      if self.has_inputs:
+        yield sample["inputs"]
+      yield sample["targets"]
+      if self.max_samples_for_vocab and (i + 1) >= self.max_samples_for_vocab:
+        break
+
+  def generate_samples(self, data_dir, tmp_dir, dataset_split):
+    data_path = self.source_data_files(dataset_split)[0]
+    assert tf.gfile.Exists(data_path)
+    return text_problems.text2text_txt_tab_iterator(data_path)
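TranslateWmt20Problem leaves source_data_files to subclasses: generate_samples reads the first file that hook returns and feeds it to text2text_txt_tab_iterator, which expects one tab-separated "source<TAB>target" pair per line. A hypothetical subclass sketch follows; the class name and path are illustrative, not part of this commit.

from tensor2tensor.data_generators import translate
from tensor2tensor.utils import registry


@registry.register_problem
class TranslateWmt20EndeExample(translate.TranslateWmt20Problem):
  """Hypothetical WMT20 En-De problem reading a local tab-separated file."""

  def source_data_files(self, dataset_split):
    # One "source<TAB>target" pair per line; a real subclass would pick
    # different files for the TRAIN and EVAL splits here.
    del dataset_split
    return ["/tmp/wmt20.en-de.train.tsv"]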
