# Reconstructed from a patch against
# InternVideo2/multi_modality/dataset/ret_dataset.py.  The patch's only other
# hunk (in ``__getitem__``) adds the missing end-of-file newline after
# ``return media, index`` and carries no logic change.
def preprocess_para_retrieval_data(anno_list):
    """Collapse multi-sentence captions into a single space-joined string.

    Each annotation dict is updated in place: when its ``"caption"`` value is
    a list (or tuple) of strings, it is replaced by those strings joined with
    a single space.  A caption that is already a string is left untouched --
    joining a ``str`` would otherwise insert a space between every character.

    Args:
        anno_list: Iterable of annotation dicts, each with a ``"caption"`` key.

    Returns:
        A new list holding the same (mutated) dict objects, in input order.

    Raises:
        KeyError: If an annotation has no ``"caption"`` key.
    """
    processed_anno_list = []
    for anno in anno_list:
        caption = anno["caption"]
        # Only sequences of sentences need joining; plain strings are
        # already in their final form.
        if isinstance(caption, (list, tuple)):
            anno["caption"] = " ".join(caption)
        processed_anno_list.append(anno)
    return processed_anno_list