diff --git a/InternVideo2/multi_modality/dataset/ret_dataset.py b/InternVideo2/multi_modality/dataset/ret_dataset.py
index 22bf4cff..d64ad35e 100644
--- a/InternVideo2/multi_modality/dataset/ret_dataset.py
+++ b/InternVideo2/multi_modality/dataset/ret_dataset.py
@@ -388,7 +388,8 @@ def build_subtitle_data(self):
 def preprocess_para_retrieval_data(anno_list):
     processed_anno_list = []
     for d in anno_list:
-        d["caption"] = " ".join(d.pop("caption"))
+        if isinstance(d['caption'], list):
+            d["caption"] = " ".join(d.pop("caption"))
         processed_anno_list.append(d)
     return processed_anno_list
 
@@ -487,4 +488,4 @@ def __getitem__(self, index):
         if audio is None and self.zero_audio_padding_for_video:
             media[0] = torch.zeros((998, 64), dtype=torch.float32)
         
-        return media, index
\ No newline at end of file
+        return media, index