Update and add Expectations for mistral3/internvl tests (#42616)

Abdennacer-Badaoui · web-flow · commit 81b84175e2d9 · 2025-12-08T09:53:36.000+01:00
diff --git a/tests/models/internvl/test_modeling_internvl.py b/tests/models/internvl/test_modeling_internvl.py
@@ -430,7 +430,14 @@ def test_qwen2_small_model_integration_batched_generate_multi_image(self):
         # Check first output
         decoded_output = processor.decode(output[0], skip_special_tokens=True)
         # Batching seems to alter the output slightly, but it is also the case in the original implementation. This seems to be expected: https://github.com/huggingface/transformers/issues/23017#issuecomment-1649630232
-        expected_output = "user\n\nWrite a haiku for this image\nassistant\nSilky lake,  \nWooden pier,  \nNature's peace."  # fmt: skip
+        expected_outputs = Expectations(
+            {
+                ("xpu", 3): 'user\n\nWrite a haiku for this image\nassistant\nSilky lake,  \nWooden pier,  \nNature\'s peace.',
+                ("cuda", 7): 'user\n\nWrite a haiku for this image\nassistant\nSilky lake,  \nWooden pier,  \nNature\'s peace.',
+                ("rocm", (9, 4)): 'user\n\nWrite a haiku for this image\nassistant\nSilky lake,  \nWooden pier,  \nNature\'s embrace.',
+            }
+        )  # fmt: skip
+        expected_output = expected_outputs.get_expectation()
         self.assertEqual(
             decoded_output,
             expected_output,
@@ -443,6 +450,7 @@ def test_qwen2_small_model_integration_batched_generate_multi_image(self):
             {
                 ("xpu", 3): "user\n\nWhat are the differences between these two images?\nassistant\nThe images show the Statue of Liberty and the Golden Gate Bridge from different angles. Here are the differences:\n\n1. **Foreground",
                 ("cuda", 7): "user\n\nWhat are the differences between these two images?\nassistant\nThe images show the Statue of Liberty and the Golden Gate Bridge from different angles. Here are the differences:\n\n1. **Foreground",
+                ("rocm", (9, 4)): "user\n\nWhat are the differences between these two images?\nassistant\nThe images show the Statue of Liberty and the Golden Gate Bridge from different angles. Here are the main differences:\n\n1. **",
             }
         )  # fmt: skip
         expected_output = expected_outputs.get_expectation()
@@ -567,6 +575,7 @@ def test_qwen2_small_model_integration_interleaved_images_videos(self):
             {
                 ("xpu", 3): "user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n   - The Statue of Liberty is prominently featured on an",
                 ("cuda", 7): 'user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n   - The Statue of Liberty is prominently featured on an',
+                ("rocm", (9, 4)): 'user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n   - This image features the Statue of Liberty on Liberty',
             }
         )  # fmt: skip
         expected_output = expected_outputs.get_expectation()
@@ -582,6 +591,7 @@ def test_qwen2_small_model_integration_interleaved_images_videos(self):
             {
                 ("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nA forehand shot",
                 ("cuda", 7): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nA forehand shot',
+                ("rocm", (9, 4)): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nA forehand shot',
             }
         )  # fmt: skip
         expected_output = expected_outputs.get_expectation()
@@ -593,9 +603,14 @@ def test_qwen2_small_model_integration_interleaved_images_videos(self):
 
         # Check third output
         decoded_output = processor.decode(output[2], skip_special_tokens=True)
-        expected_output = (
-            "user\n\nWrite a haiku for this image\nassistant\nSilky lake,  \nWooden pier,  \nNature's peace."
-        )
+        expected_outputs = Expectations(
+            {
+                ("xpu", 3): 'user\n\nWrite a haiku for this image\nassistant\nSilky lake,  \nWooden pier,  \nNature\'s peace.',
+                ("cuda", 7): 'user\n\nWrite a haiku for this image\nassistant\nSilky lake,  \nWooden pier,  \nNature\'s peace.',
+                ("rocm", (9, 4)): 'user\n\nWrite a haiku for this image\nassistant\nSilky lake,  \nWooden pier,  \nNature\'s embrace.',
+            }
+        )  # fmt: skip
+        expected_output = expected_outputs.get_expectation()
         self.assertEqual(
             decoded_output,
             expected_output,
@@ -658,7 +673,7 @@ def test_llama_small_model_integration_forward(self):
                 ("xpu", 3): [-9.8828,  -0.4954,   1.4561, -10.3438, -10.3438],
                 ("cuda", 7): [-9.8750,  -0.4861,   1.4648, -10.3359, -10.3359],
                 ("cuda", 8): [-9.8906,  -0.4995,   1.4473, -10.3359, -10.3438],
-                ("rocm", (9, 4)): [ -9.8828,  -0.5005,   1.4697, -10.3438, -10.3438],
+                ("rocm", (9, 4)): [ -9.8672,  -0.4888,   1.4648, -10.3281, -10.3281],
                 ("rocm", (9, 5)): [ -9.8906,  -0.4976,   1.4502, -10.3359, -10.3438],
             }
         )  # fmt: skip
@@ -934,7 +949,7 @@ def test_llama_small_model_integration_interleaved_images_videos(self):
                 ("xpu", 3): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. Upon closer inspection, the differences between the two images are:\n\n1. **",
                 ("cuda", 7): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. Upon closer inspection, the differences between the two images are:\n\n1. **',
                 ("cuda", 8): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that there are no',
-                ("rocm", (9, 4)): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. Upon closer inspection, the differences between the two images are:\n\n1. **',
+                ("rocm", (9, 4)): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that there are no',
                 ("rocm", (9, 5)): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that there are no',
             }
         )  # fmt: skip
diff --git a/tests/models/mistral3/test_modeling_mistral3.py b/tests/models/mistral3/test_modeling_mistral3.py
@@ -355,7 +355,8 @@ def test_mistral3_integration_batched_generate(self):
         expected_outputs = Expectations(
             {
                 ("xpu", 3): "Calm lake's mirror gleams,\nWhispering pines stand in silence,\nPath to peace begins.",
-                ("cuda", 8): "Wooden path to calm,\nReflections whisper secrets,\nNature's peace unfolds.",
+                ("cuda", (8, 0)): "Wooden path to calm,\nReflections whisper secrets,\nNature's peace unfolds.",
+                ("cuda", (8, 6)): "Calm waters reflect\nWooden path to distant shore\nSilence in the woods",
                 ("rocm", (9, 5)): "Calm waters reflect\nWooden path to distant shore\nSilence in the scene"
             }
         )  # fmt: skip
@@ -432,7 +433,8 @@ def test_mistral3_integration_batched_generate_multi_image(self):
         decoded_output = processor.decode(gen_tokens[0], skip_special_tokens=True)
         expected_outputs = Expectations(
             {
-                ("cuda", 8): 'Calm waters reflect\nWooden path to distant shore\nSilence in the scene',
+                ("cuda", 8): "Calm waters reflect\nWooden path to distant shore\nPeace in nature's hold",
+                ("rocm", (9, 4)): "Calm waters reflect\nWooden path to distant shore\nSilence in the pines"
             }
         )  # fmt: skip
         expected_output = expected_outputs.get_expectation()
@@ -448,6 +450,7 @@ def test_mistral3_integration_batched_generate_multi_image(self):
             {
                 ("xpu", 3): "Certainly! The images depict two iconic landmarks:\n\n1. The first image shows the Statue of Liberty in New York City.",
                 ("cuda", 8): 'Certainly! The images depict two famous landmarks in the United States:\n\n1. The first image shows the Statue of Liberty,',
+                ("rocm", (9, 4)): 'Certainly! The images depict two famous landmarks in the United States:\n\n1. The first image shows the Statue of Liberty,',
             }
         )  # fmt: skip
         expected_output = expected_outputs.get_expectation()

Original file line number	Diff line number	Diff line change
`@@ -355,7 +355,8 @@ def test_mistral3_integration_batched_generate(self):`
`355`	`355`	`expected_outputs = Expectations(`
`356`	`356`	`{`
`357`	`357`	`("xpu", 3): "Calm lake's mirror gleams,\nWhispering pines stand in silence,\nPath to peace begins.",`
`358`		`- ("cuda", 8): "Wooden path to calm,\nReflections whisper secrets,\nNature's peace unfolds.",`
	`358`	`+ ("cuda", (8, 0)): "Wooden path to calm,\nReflections whisper secrets,\nNature's peace unfolds.",`
	`359`	`+ ("cuda", (8, 6)): "Calm waters reflect\nWooden path to distant shore\nSilence in the woods",`
`359`	`360`	`("rocm", (9, 5)): "Calm waters reflect\nWooden path to distant shore\nSilence in the scene"`
`360`	`361`	`}`
`361`	`362`	`) # fmt: skip`
`@@ -432,7 +433,8 @@ def test_mistral3_integration_batched_generate_multi_image(self):`
`432`	`433`	`decoded_output = processor.decode(gen_tokens[0], skip_special_tokens=True)`
`433`	`434`	`expected_outputs = Expectations(`
`434`	`435`	`{`
`435`		`- ("cuda", 8): 'Calm waters reflect\nWooden path to distant shore\nSilence in the scene',`
	`436`	`+ ("cuda", 8): "Calm waters reflect\nWooden path to distant shore\nPeace in nature's hold",`
	`437`	`+ ("rocm", (9, 4)): "Calm waters reflect\nWooden path to distant shore\nSilence in the pines"`
`436`	`438`	`}`
`437`	`439`	`) # fmt: skip`
`438`	`440`	`expected_output = expected_outputs.get_expectation()`
`@@ -448,6 +450,7 @@ def test_mistral3_integration_batched_generate_multi_image(self):`
`448`	`450`	`{`
`449`	`451`	`("xpu", 3): "Certainly! The images depict two iconic landmarks:\n\n1. The first image shows the Statue of Liberty in New York City.",`
`450`	`452`	`("cuda", 8): 'Certainly! The images depict two famous landmarks in the United States:\n\n1. The first image shows the Statue of Liberty,',`
	`453`	`+ ("rocm", (9, 4)): 'Certainly! The images depict two famous landmarks in the United States:\n\n1. The first image shows the Statue of Liberty,',`
`451`	`454`	`}`
`452`	`455`	`) # fmt: skip`
`453`	`456`	`expected_output = expected_outputs.get_expectation()`