From 5df96281a9d07b576c929e39770e0b3f942e36d4 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Thu, 30 Oct 2025 18:29:34 +0000 Subject: [PATCH 01/35] sdks/python: replace the deprecated testcontainer max tries --- .../ml/rag/enrichment/milvus_search_it_test.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py index 2df9af2f1144..5094d9076e93 100644 --- a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py +++ b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py @@ -53,7 +53,6 @@ MilvusClient, RRFRanker) from pymilvus.milvus_client import IndexParams - from testcontainers.core.config import MAX_TRIES as TC_MAX_TRIES from testcontainers.core.config import testcontainers_config from testcontainers.core.generic import DbContainer from testcontainers.milvus import MilvusContainer @@ -306,13 +305,15 @@ def start_db_container( image="milvusdb/milvus:v2.5.10", max_vec_fields=5, vector_client_max_retries=3, - tc_max_retries=TC_MAX_TRIES) -> Optional[MilvusDBContainerInfo]: + tc_max_retries=None) -> Optional[MilvusDBContainerInfo]: service_container_port = MilvusEnrichmentTestHelper.find_free_port() healthcheck_container_port = MilvusEnrichmentTestHelper.find_free_port() user_yaml_creator = MilvusEnrichmentTestHelper.create_user_yaml with user_yaml_creator(service_container_port, max_vec_fields) as cfg: info = None - testcontainers_config.max_tries = tc_max_retries + original_tc_max_tries = testcontainers_config.max_tries + if not testcontainers_config.max_tries: + testcontainers_config.max_tries = tc_max_retries for i in range(vector_client_max_retries): try: vector_db_container = CustomMilvusContainer( @@ -325,7 +326,7 @@ def start_db_container( host = vector_db_container.get_container_host_ip() port = vector_db_container.get_exposed_port(service_container_port) info = MilvusDBContainerInfo(vector_db_container, host, port) - testcontainers_config.max_tries = TC_MAX_TRIES + testcontainers_config.max_tries = original_tc_max_tries _LOGGER.info( "milvus db container started successfully on %s.", info.uri) break From 91266a7db291dde1d0536f90be6ecea223b6ced4 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Thu, 30 Oct 2025 19:00:03 +0000 Subject: [PATCH 02/35] sdks/python: handle transient testcontainer startup/teardown errors --- .../transforms/elementwise/enrichment_test.py | 30 +++++++++++-------- .../rag/enrichment/milvus_search_it_test.py | 15 ++++------ 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py index c8e988a52c5d..083d246a439a 100644 --- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py +++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py @@ -68,6 +68,9 @@ class TestContainerStartupError(Exception): """Raised when any test container fails to start.""" pass +class TestContainerTeardownError(Exception): + """Raised when any test container fails to teardown.""" + pass def validate_enrichment_with_bigtable(): expected = '''[START enrichment_with_bigtable] @@ -186,7 +189,7 @@ def test_enrichment_with_external_pg(self, mock_stdout): output = mock_stdout.getvalue().splitlines() expected = validate_enrichment_with_external_pg() 
self.assertEqual(output, expected) - except TestContainerStartupError as e: + except (TestContainerStartupError, TestContainerTeardownError) as e: raise unittest.SkipTest(str(e)) except Exception as e: self.fail(f"Test failed with unexpected error: {e}") @@ -199,7 +202,7 @@ def test_enrichment_with_external_mysql(self, mock_stdout): output = mock_stdout.getvalue().splitlines() expected = validate_enrichment_with_external_mysql() self.assertEqual(output, expected) - except TestContainerStartupError as e: + except (TestContainerStartupError, TestContainerTeardownError) as e: raise unittest.SkipTest(str(e)) except Exception as e: self.fail(f"Test failed with unexpected error: {e}") @@ -212,7 +215,7 @@ def test_enrichment_with_external_sqlserver(self, mock_stdout): output = mock_stdout.getvalue().splitlines() expected = validate_enrichment_with_external_sqlserver() self.assertEqual(output, expected) - except TestContainerStartupError as e: + except (TestContainerStartupError, TestContainerTeardownError) as e: raise unittest.SkipTest(str(e)) except Exception as e: self.fail(f"Test failed with unexpected error: {e}") @@ -227,7 +230,7 @@ def test_enrichment_with_milvus(self, mock_stdout): output = parse_chunk_strings(output) expected = parse_chunk_strings(expected) assert_chunks_equivalent(output, expected) - except TestContainerStartupError as e: + except (TestContainerStartupError, TestContainerTeardownError) as e: raise unittest.SkipTest(str(e)) except Exception as e: self.fail(f"Test failed with unexpected error: {e}") @@ -373,19 +376,17 @@ def post_sql_enrichment_test(res: CloudSQLEnrichmentTestDataConstruct): def pre_milvus_enrichment() -> MilvusDBContainerInfo: try: db = MilvusEnrichmentTestHelper.start_db_container() - except Exception as e: - raise TestContainerStartupError( - f"Milvus container failed to start: {str(e)}") - - connection_params = MilvusConnectionParameters( + connection_params = MilvusConnectionParameters( uri=db.uri, user=db.user, password=db.password, db_id=db.id, token=db.token) - - collection_name = MilvusEnrichmentTestHelper.initialize_db_with_data( + collection_name = MilvusEnrichmentTestHelper.initialize_db_with_data( connection_params) + except Exception as e: + raise TestContainerStartupError( + f"Milvus container failed to start: {str(e)}") # Setup environment variables for db and collection configuration. This will # be used downstream by the milvus enrichment handler. 
@@ -400,7 +401,12 @@ def pre_milvus_enrichment() -> MilvusDBContainerInfo:
 
   @staticmethod
   def post_milvus_enrichment(db: MilvusDBContainerInfo):
-    MilvusEnrichmentTestHelper.stop_db_container(db)
+    try:
+      MilvusEnrichmentTestHelper.stop_db_container(db)
+    except Exception as e:
+      raise TestContainerTeardownError(
+          f"Milvus container failed to tear down: {str(e)}")
+
     os.environ.pop('MILVUS_VECTOR_DB_URI', None)
     os.environ.pop('MILVUS_VECTOR_DB_USER', None)
     os.environ.pop('MILVUS_VECTOR_DB_PASSWORD', None)
diff --git a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py
index 5094d9076e93..4184aca0bfe9 100644
--- a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py
+++ b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py
@@ -312,7 +312,7 @@ def start_db_container(
     with user_yaml_creator(service_container_port, max_vec_fields) as cfg:
       info = None
       original_tc_max_tries = testcontainers_config.max_tries
-      if not testcontainers_config.max_tries:
+      if tc_max_retries is not None:
         testcontainers_config.max_tries = tc_max_retries
       for i in range(vector_client_max_retries):
         try:
@@ -326,7 +326,6 @@ def start_db_container(
           host = vector_db_container.get_container_host_ip()
           port = vector_db_container.get_exposed_port(service_container_port)
           info = MilvusDBContainerInfo(vector_db_container, host, port)
-          testcontainers_config.max_tries = original_tc_max_tries
           _LOGGER.info(
               "milvus db container started successfully on %s.", info.uri)
           break
@@ -351,6 +350,8 @@ def start_db_container(
                 stdout_logs,
                 stderr_logs)
             raise e
+      finally:
+        testcontainers_config.max_tries = original_tc_max_tries
     return info
 
   @staticmethod
   def stop_db_container(db_info: MilvusDBContainerInfo):
     if db_info is None:
       _LOGGER.warning("Milvus db info is None. 
Skipping stop operation.") return - try: - _LOGGER.debug("Stopping milvus db container.") - db_info.container.stop() - _LOGGER.info("milvus db container stopped successfully.") - except Exception as e: - _LOGGER.warning( - "Error encountered while stopping milvus db container: %s", e) + _LOGGER.debug("Stopping milvus db container.") + db_info.container.stop() + _LOGGER.info("milvus db container stopped successfully.") @staticmethod def initialize_db_with_data(connc_params: MilvusConnectionParameters): From fa6d2f06994e6ed14269c39b500b66bf336e3f60 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Fri, 31 Oct 2025 14:42:30 +0000 Subject: [PATCH 03/35] sdks/python: bump `testcontainers` py pkg version --- sdks/python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/setup.py b/sdks/python/setup.py index 9ed2a124e94d..d7afb0a2f112 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -463,7 +463,7 @@ def get_portability_package_data(): 'sqlalchemy>=1.3,<3.0', 'psycopg2-binary>=2.8.5,<2.9.10; python_version <= "3.9"', 'psycopg2-binary>=2.8.5,<3.0; python_version >= "3.10"', - 'testcontainers[mysql,kafka,milvus]>=4.0.0,<5.0.0', + 'testcontainers[mysql,kafka,milvus]>=4.13.2,<5.0.0', 'cryptography>=41.0.2', 'hypothesis>5.0.0,<7.0.0', 'virtualenv-clone>=0.5,<1.0', From 9445aaad9bfb2650081e14348d30df5b1f626a97 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Fri, 31 Oct 2025 15:19:07 +0000 Subject: [PATCH 04/35] sdks/python: integrate milvus sink I/O --- .../transforms/elementwise/enrichment_test.py | 18 +- .../ml/rag/enrichment/milvus_search.py | 49 +- .../rag/enrichment/milvus_search_it_test.py | 343 ++-------- .../ml/rag/ingestion/milvus_search.py | 340 ++++++++++ .../ml/rag/ingestion/milvus_search_it_test.py | 616 ++++++++++++++++++ .../ml/rag/ingestion/milvus_search_test.py | 122 ++++ .../ml/rag/ingestion/postgres_common.py | 38 +- sdks/python/apache_beam/ml/rag/test_utils.py | 304 +++++++++ sdks/python/apache_beam/ml/rag/utils.py | 129 ++++ 9 files changed, 1632 insertions(+), 327 deletions(-) create mode 100644 sdks/python/apache_beam/ml/rag/ingestion/milvus_search.py create mode 100644 sdks/python/apache_beam/ml/rag/ingestion/milvus_search_it_test.py create mode 100644 sdks/python/apache_beam/ml/rag/ingestion/milvus_search_test.py create mode 100644 sdks/python/apache_beam/ml/rag/test_utils.py create mode 100644 sdks/python/apache_beam/ml/rag/utils.py diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py index 083d246a439a..f303b4a670a2 100644 --- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py +++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py @@ -57,8 +57,8 @@ from apache_beam.ml.rag.enrichment.milvus_search_it_test import ( MilvusEnrichmentTestHelper, MilvusDBContainerInfo, - parse_chunk_strings, assert_chunks_equivalent) + from apache_beam.ml.rag.utils import parse_chunk_strings from apache_beam.io.requestresponse import RequestResponseIO except ImportError as e: raise unittest.SkipTest(f'Examples dependencies are not installed: {str(e)}') @@ -68,10 +68,12 @@ class TestContainerStartupError(Exception): """Raised when any test container fails to start.""" pass + class TestContainerTeardownError(Exception): """Raised when any test container fails to teardown.""" pass + def validate_enrichment_with_bigtable(): expected = '''[START 
enrichment_with_bigtable]
Row(sale_id=1, customer_id=1, product_id=1, quantity=1, product={'product_id': '1', 'product_name': 'pixel 5', 'product_stock': '2'})
@@ -377,13 +379,13 @@ def pre_milvus_enrichment() -> MilvusDBContainerInfo:
     try:
       db = MilvusEnrichmentTestHelper.start_db_container()
       connection_params = MilvusConnectionParameters(
-        uri=db.uri,
-        user=db.user,
-        password=db.password,
-        db_id=db.id,
-        token=db.token)
+          uri=db.uri,
+          user=db.user,
+          password=db.password,
+          db_id=db.id,
+          token=db.token)
       collection_name = MilvusEnrichmentTestHelper.initialize_db_with_data(
-        connection_params)
+          connection_params)
     except Exception as e:
       raise TestContainerStartupError(
           f"Milvus container failed to start: {str(e)}")
@@ -405,7 +407,7 @@ def post_milvus_enrichment(db: MilvusDBContainerInfo):
       MilvusEnrichmentTestHelper.stop_db_container(db)
     except Exception as e:
       raise TestContainerTeardownError(
-          f"Milvus container failed to tear down: {str(e)}")
+        f"Milvus container failed to tear down: {str(e)}")
 
     os.environ.pop('MILVUS_VECTOR_DB_URI', None)
     os.environ.pop('MILVUS_VECTOR_DB_USER', None)
diff --git a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search.py b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search.py
index 431c0db3f416..d488c8d3d80d 100644
--- a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search.py
+++ b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search.py
@@ -25,6 +25,7 @@
 from typing import Optional
 from typing import Tuple
 from typing import Union
+import uuid
 
 from google.protobuf.json_format import MessageToDict
 from pymilvus import AnnSearchRequest
@@ -35,6 +36,7 @@
 
 from apache_beam.ml.rag.types import Chunk
 from apache_beam.ml.rag.types import Embedding
+from apache_beam.ml.rag.utils import MilvusHelpers, MilvusConnectionParameters
 from apache_beam.transforms.enrichment import EnrichmentSourceHandler
 
 
@@ -104,44 +106,6 @@ def __str__(self):
     return self.dict().__str__()
 
 
-@dataclass
-class MilvusConnectionParameters:
-  """Parameters for establishing connections to Milvus servers.
-
-  Args:
-    uri: URI endpoint for connecting to Milvus server in the format
-      "http(s)://hostname:port".
-    user: Username for authentication. Required if authentication is enabled and
-      not using token authentication.
-    password: Password for authentication. Required if authentication is enabled
-      and not using token authentication.
-    db_id: Database ID to connect to. Specifies which Milvus database to use.
-      Defaults to 'default'.
-    token: Authentication token as an alternative to username/password.
-    timeout: Connection timeout in seconds. Uses client default if None.
-    max_retries: Maximum number of connection retry attempts. Defaults to 3.
-    retry_delay: Initial delay between retries in seconds. Defaults to 1.0.
-    retry_backoff_factor: Multiplier for retry delay after each attempt.
-      Defaults to 2.0 (exponential backoff).
-    kwargs: Optional keyword arguments for additional connection parameters.
-      Enables forward compatibility.
- """ - uri: str - user: str = field(default_factory=str) - password: str = field(default_factory=str) - db_id: str = "default" - token: str = field(default_factory=str) - timeout: Optional[float] = None - max_retries: int = 3 - retry_delay: float = 1.0 - retry_backoff_factor: float = 2.0 - kwargs: Dict[str, Any] = field(default_factory=dict) - - def __post_init__(self): - if not self.uri: - raise ValueError("URI must be provided for Milvus connection") - - @dataclass class BaseSearchParameters: """Base parameters for both vector and keyword search operations. @@ -361,7 +325,7 @@ def __init__( **kwargs): """ Example Usage: - connection_paramters = MilvusConnectionParameters( + connection_parameters = MilvusConnectionParameters( uri="http://localhost:19530") search_parameters = MilvusSearchParameters( collection_name="my_collection", @@ -369,7 +333,7 @@ def __init__( collection_load_parameters = MilvusCollectionLoadParameters( load_fields=["embedding", "metadata"]), milvus_handler = MilvusSearchEnrichmentHandler( - connection_paramters, + connection_parameters, search_parameters, collection_load_parameters=collection_load_parameters, min_batch_size=10, @@ -534,10 +498,7 @@ def _get_keyword_search_data(self, chunk: Chunk): raise ValueError( f"Chunk {chunk.id} missing both text content and sparse embedding " "required for keyword search") - - sparse_embedding = self.convert_sparse_embedding_to_milvus_format( - chunk.sparse_embedding) - + sparse_embedding = MilvusHelpers.sparse_embedding(chunk.sparse_embedding) return chunk.content.text or sparse_embedding def _get_call_response( diff --git a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py index 4184aca0bfe9..094788664bdb 100644 --- a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py +++ b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py @@ -57,6 +57,8 @@ from testcontainers.core.generic import DbContainer from testcontainers.milvus import MilvusContainer from apache_beam.transforms.enrichment import Enrichment + from apache_beam.ml.rag.test_utils import ( + MilvusTestHelpers, VectorDBContainerInfo) from apache_beam.ml.rag.enrichment.milvus_search import ( MilvusSearchEnrichmentHandler, MilvusConnectionParameters, @@ -243,241 +245,67 @@ def __getitem__(self, key): } -@dataclass -class MilvusDBContainerInfo: - container: DbContainer - host: str - port: int - user: Optional[str] = "" - password: Optional[str] = "" - token: Optional[str] = "" - id: Optional[str] = "default" - - @property - def uri(self) -> str: - return f"http://{self.host}:{self.port}" - - -class CustomMilvusContainer(MilvusContainer): - def __init__( - self, - image: str, - service_container_port, - healthcheck_container_port, - **kwargs, - ) -> None: - # Skip the parent class's constructor and go straight to - # GenericContainer. - super(MilvusContainer, self).__init__(image=image, **kwargs) - self.port = service_container_port - self.healthcheck_port = healthcheck_container_port - self.with_exposed_ports(service_container_port, healthcheck_container_port) - - # Get free host ports. - service_host_port = MilvusEnrichmentTestHelper.find_free_port() - healthcheck_host_port = MilvusEnrichmentTestHelper.find_free_port() - - # Bind container and host ports. 
- self.with_bind_ports(service_container_port, service_host_port) - self.with_bind_ports(healthcheck_container_port, healthcheck_host_port) - self.cmd = "milvus run standalone" - - # Set environment variables needed for Milvus. - envs = { - "ETCD_USE_EMBED": "true", - "ETCD_DATA_DIR": "/var/lib/milvus/etcd", - "COMMON_STORAGETYPE": "local", - "METRICS_PORT": str(healthcheck_container_port) - } - for env, value in envs.items(): - self.with_env(env, value) - - -class MilvusEnrichmentTestHelper: - # IMPORTANT: When upgrading the Milvus server version, ensure the pymilvus - # Python SDK client in setup.py is updated to match. Referring to the Milvus - # release notes compatibility matrix at - # https://milvus.io/docs/release_notes.md or PyPI at - # https://pypi.org/project/pymilvus/ for version compatibility. - # Example: Milvus v2.6.0 requires pymilvus==2.6.0 (exact match required). - @staticmethod - def start_db_container( - image="milvusdb/milvus:v2.5.10", - max_vec_fields=5, - vector_client_max_retries=3, - tc_max_retries=None) -> Optional[MilvusDBContainerInfo]: - service_container_port = MilvusEnrichmentTestHelper.find_free_port() - healthcheck_container_port = MilvusEnrichmentTestHelper.find_free_port() - user_yaml_creator = MilvusEnrichmentTestHelper.create_user_yaml - with user_yaml_creator(service_container_port, max_vec_fields) as cfg: - info = None - original_tc_max_tries = testcontainers_config.max_tries - if testcontainers_config.max_tries is not None: - testcontainers_config.max_tries = tc_max_retries - for i in range(vector_client_max_retries): - try: - vector_db_container = CustomMilvusContainer( - image=image, - service_container_port=service_container_port, - healthcheck_container_port=healthcheck_container_port) - vector_db_container = vector_db_container.with_volume_mapping( - cfg, "/milvus/configs/user.yaml") - vector_db_container.start() - host = vector_db_container.get_container_host_ip() - port = vector_db_container.get_exposed_port(service_container_port) - info = MilvusDBContainerInfo(vector_db_container, host, port) - _LOGGER.info( - "milvus db container started successfully on %s.", info.uri) - break - except Exception as e: - stdout_logs, stderr_logs = vector_db_container.get_logs() - stdout_logs = stdout_logs.decode("utf-8") - stderr_logs = stderr_logs.decode("utf-8") - _LOGGER.warning( - "Retry %d/%d: Failed to start Milvus DB container. Reason: %s. " - "STDOUT logs:\n%s\nSTDERR logs:\n%s", - i + 1, - vector_client_max_retries, - e, - stdout_logs, - stderr_logs) - if i == vector_client_max_retries - 1: - _LOGGER.error( - "Unable to start milvus db container for I/O tests after %d " - "retries. Tests cannot proceed. STDOUT logs:\n%s\n" - "STDERR logs:\n%s", - vector_client_max_retries, - stdout_logs, - stderr_logs) - raise e - finally: - testcontainers_config.max_tries = original_tc_max_tries - return info - - @staticmethod - def stop_db_container(db_info: MilvusDBContainerInfo): - if db_info is None: - _LOGGER.warning("Milvus db info is None. Skipping stop operation.") - return - _LOGGER.debug("Stopping milvus db container.") - db_info.container.stop() - _LOGGER.info("milvus db container stopped successfully.") - - @staticmethod - def initialize_db_with_data(connc_params: MilvusConnectionParameters): - # Open the connection to the milvus db. - client = MilvusClient(**connc_params.__dict__) - - # Configure schema. 
- field_schemas: List[FieldSchema] = cast( - List[FieldSchema], MILVUS_IT_CONFIG["fields"]) - schema = CollectionSchema( - fields=field_schemas, functions=MILVUS_IT_CONFIG["functions"]) - - # Create collection with the schema. - collection_name = MILVUS_IT_CONFIG["collection_name"] - index_function: Callable[[], IndexParams] = cast( - Callable[[], IndexParams], MILVUS_IT_CONFIG["index"]) - client.create_collection( - collection_name=collection_name, - schema=schema, - index_params=index_function()) - - # Assert that collection was created. - collection_error = f"Expected collection '{collection_name}' to be created." - assert client.has_collection(collection_name), collection_error - - # Gather all fields we have excluding 'sparse_embedding_bm25' special field. - fields = list(map(lambda field: field.name, field_schemas)) - - # Prep data for indexing. Currently we can't insert sparse vectors for BM25 - # sparse embedding field as it would be automatically generated by Milvus - # through the registered BM25 function. - data_ready_to_index = [] - for doc in MILVUS_IT_CONFIG["corpus"]: - item = {} - for field in fields: - if field.startswith("dense_embedding"): - item[field] = doc["dense_embedding"] - elif field == "sparse_embedding_inner_product": - item[field] = doc["sparse_embedding"] - elif field == "sparse_embedding_bm25": - # It is automatically generated by Milvus from the content field. - continue - else: - item[field] = doc[field] - data_ready_to_index.append(item) - - # Index data. - result = client.insert( - collection_name=collection_name, data=data_ready_to_index) - - # Assert that the intended data has been properly indexed. - insertion_err = f'failed to insert the {result["insert_count"]} data points' - assert result["insert_count"] == len(data_ready_to_index), insertion_err - - # Release the collection from memory. It will be loaded lazily when the - # enrichment handler is invoked. - client.release_collection(collection_name) - - # Close the connection to the Milvus database, as no further preparation - # operations are needed before executing the enrichment handler. - client.close() - - return collection_name - - @staticmethod - def find_free_port(): - """Find a free port on the local machine.""" - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - # Bind to port 0, which asks OS to assign a free port. - s.bind(('', 0)) - s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - # Return the port number assigned by OS. - return s.getsockname()[1] - - @staticmethod - @contextlib.contextmanager - def create_user_yaml(service_port: int, max_vector_field_num=5): - """Creates a temporary user.yaml file for Milvus configuration. - - This user yaml file overrides Milvus default configurations. It sets - the Milvus service port to the specified container service port. The - default for maxVectorFieldNum is 4, but we need 5 - (one unique field for each metric). - - Args: - service_port: Port number for the Milvus service. - max_vector_field_num: Max number of vec fields allowed per collection. - - Yields: - str: Path to the created temporary yaml file. - """ - with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', - delete=False) as temp_file: - # Define the content for user.yaml. - user_config = { - 'proxy': { - 'maxVectorFieldNum': max_vector_field_num, 'port': service_port - }, - 'etcd': { - 'use': { - 'embed': True - }, 'data': { - 'dir': '/var/lib/milvus/etcd' - } - } - } - - # Write the content to the file. 
- yaml.dump(user_config, temp_file, default_flow_style=False) - path = temp_file.name - - try: - yield path - finally: - if os.path.exists(path): - os.remove(path) +def initialize_db_with_data(connc_params: MilvusConnectionParameters): + # Open the connection to the milvus db. + client = MilvusClient(**connc_params.__dict__) + + # Configure schema. + field_schemas: List[FieldSchema] = cast( + List[FieldSchema], MILVUS_IT_CONFIG["fields"]) + schema = CollectionSchema( + fields=field_schemas, functions=MILVUS_IT_CONFIG["functions"]) + + # Create collection with the schema. + collection_name = MILVUS_IT_CONFIG["collection_name"] + index_function: Callable[[], IndexParams] = cast( + Callable[[], IndexParams], MILVUS_IT_CONFIG["index"]) + client.create_collection( + collection_name=collection_name, + schema=schema, + index_params=index_function()) + + # Assert that collection was created. + collection_error = f"Expected collection '{collection_name}' to be created." + assert client.has_collection(collection_name), collection_error + + # Gather all fields we have excluding 'sparse_embedding_bm25' special field. + fields = list(map(lambda field: field.name, field_schemas)) + + # Prep data for indexing. Currently we can't insert sparse vectors for BM25 + # sparse embedding field as it would be automatically generated by Milvus + # through the registered BM25 function. + data_ready_to_index = [] + for doc in MILVUS_IT_CONFIG["corpus"]: + item = {} + for field in fields: + if field.startswith("dense_embedding"): + item[field] = doc["dense_embedding"] + elif field == "sparse_embedding_inner_product": + item[field] = doc["sparse_embedding"] + elif field == "sparse_embedding_bm25": + # It is automatically generated by Milvus from the content field. + continue + else: + item[field] = doc[field] + data_ready_to_index.append(item) + + # Index data. + result = client.insert( + collection_name=collection_name, data=data_ready_to_index) + + # Assert that the intended data has been properly indexed. + insertion_err = f'failed to insert the {result["insert_count"]} data points' + assert result["insert_count"] == len(data_ready_to_index), insertion_err + + # Release the collection from memory. It will be loaded lazily when the + # enrichment handler is invoked. + client.release_collection(collection_name) + + # Close the connection to the Milvus database, as no further preparation + # operations are needed before executing the enrichment handler. 
+ client.close() + + return collection_name @pytest.mark.require_docker_in_docker @@ -491,25 +319,23 @@ def create_user_yaml(service_port: int, max_vector_field_num=5): class TestMilvusSearchEnrichment(unittest.TestCase): """Tests for search functionality across all search strategies""" - _db: MilvusDBContainerInfo + _db: VectorDBContainerInfo @classmethod def setUpClass(cls): - cls._db = MilvusEnrichmentTestHelper.start_db_container() + cls._db = MilvusTestHelpers.start_db_container() cls._connection_params = MilvusConnectionParameters( uri=cls._db.uri, user=cls._db.user, password=cls._db.password, - db_id=cls._db.id, - token=cls._db.token, - timeout=60.0) # Increase timeout to 60s for container startup + db_name=cls._db.id, + token=cls._db.token) cls._collection_load_params = MilvusCollectionLoadParameters() - cls._collection_name = MilvusEnrichmentTestHelper.initialize_db_with_data( - cls._connection_params) + cls._collection_name = initialize_db_with_data(cls._connection_params) @classmethod def tearDownClass(cls): - MilvusEnrichmentTestHelper.stop_db_container(cls._db) + MilvusTestHelpers.stop_db_container(cls._db) cls._db = None def test_invalid_query_on_non_existent_collection(self): @@ -1244,37 +1070,6 @@ def test_hybrid_search(self): lambda actual: assert_chunks_equivalent(actual, expected_chunks)) -def parse_chunk_strings(chunk_str_list: List[str]) -> List[Chunk]: - parsed_chunks = [] - - # Define safe globals and disable built-in functions for safety. - safe_globals = { - 'Chunk': Chunk, - 'Content': Content, - 'Embedding': Embedding, - 'defaultdict': defaultdict, - 'list': list, - '__builtins__': {} - } - - for raw_str in chunk_str_list: - try: - # replace "" with actual list reference. - cleaned_str = re.sub( - r"defaultdict\(", "defaultdict(list", raw_str) - - # Evaluate string in restricted environment. - chunk = eval(cleaned_str, safe_globals) # pylint: disable=eval-used - if isinstance(chunk, Chunk): - parsed_chunks.append(chunk) - else: - raise ValueError("Parsed object is not a Chunk instance") - except Exception as e: - raise ValueError(f"Error parsing string:\n{raw_str}\n{e}") - - return parsed_chunks - - def assert_chunks_equivalent( actual_chunks: List[Chunk], expected_chunks: List[Chunk]): """assert_chunks_equivalent checks for presence rather than exact match""" diff --git a/sdks/python/apache_beam/ml/rag/ingestion/milvus_search.py b/sdks/python/apache_beam/ml/rag/ingestion/milvus_search.py new file mode 100644 index 000000000000..041349efeb77 --- /dev/null +++ b/sdks/python/apache_beam/ml/rag/ingestion/milvus_search.py @@ -0,0 +1,340 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
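+
+"""Milvus vector sink for Apache Beam RAG ingestion pipelines.
+
+A minimal usage sketch, assuming a reachable Milvus server at the given URI
+and an existing collection whose schema matches the default column specs
+(`chunks` below stands for a list of apache_beam.ml.rag.types.Chunk objects
+supplied by the caller):
+
+  config = MilvusVectorWriterConfig(
+      connection_params=MilvusConnectionParameters(
+          uri="http://localhost:19530"),
+      write_config=MilvusWriteConfig(collection_name="my_collection"))
+  with beam.Pipeline() as p:
+    _ = (p | beam.Create(chunks) | config.create_write_transform())
+"""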
+
+import logging
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Optional
+
+from pymilvus import MilvusClient
+
+import apache_beam as beam
+from apache_beam.ml.rag.ingestion.base import VectorDatabaseWriteConfig
+from apache_beam.ml.rag.ingestion.jdbc_common import WriteConfig
+from apache_beam.ml.rag.ingestion.postgres_common import ColumnSpec
+from apache_beam.ml.rag.ingestion.postgres_common import ColumnSpecsBuilder
+from apache_beam.ml.rag.types import Chunk
+from apache_beam.ml.rag.utils import (
+    DEFAULT_WRITE_BATCH_SIZE,
+    MilvusConnectionParameters,
+    MilvusHelpers,
+    unpack_dataclass_with_kwargs)
+from apache_beam.transforms import DoFn
+
+_LOGGER = logging.getLogger(__name__)
+
+
+@dataclass
+class MilvusWriteConfig:
+  """Configuration parameters for writing data to Milvus collections.
+
+  This class defines the parameters needed to write data to a Milvus
+  collection, including collection targeting, batching behavior, and
+  operation timeouts.
+
+  Args:
+    collection_name: Name of the target Milvus collection to write data to.
+      Must be a non-empty string.
+    partition_name: Name of the specific partition within the collection to
+      write to. If empty, writes to the default partition.
+    timeout: Maximum time in seconds to wait for write operations to complete.
+      If None, uses the client's default timeout.
+    write_config: Configuration for write operations including batch size and
+      other write-specific settings.
+    kwargs: Additional keyword arguments for write operations. Enables forward
+      compatibility with future Milvus client parameters.
+  """
+  collection_name: str
+  partition_name: str = ""
+  timeout: Optional[float] = None
+  write_config: WriteConfig = field(default_factory=WriteConfig)
+  kwargs: Dict[str, Any] = field(default_factory=dict)
+
+  def __post_init__(self):
+    if not self.collection_name:
+      raise ValueError("Collection name must be provided")
+
+  @property
+  def write_batch_size(self):
+    """Returns the batch size for write operations.
+
+    Returns:
+      The configured batch size, or DEFAULT_WRITE_BATCH_SIZE if not specified.
+    """
+    return self.write_config.write_batch_size or DEFAULT_WRITE_BATCH_SIZE
+
+
+@dataclass
+class MilvusVectorWriterConfig(VectorDatabaseWriteConfig):
+  """Configuration for writing vector data to Milvus collections.
+
+  This class extends VectorDatabaseWriteConfig to provide Milvus-specific
+  configuration for ingesting vector embeddings and associated metadata.
+  It defines how Apache Beam chunks are converted to Milvus records and
+  handles the write operation parameters.
+
+  The configuration includes connection parameters, write settings, and
+  column specifications that determine how chunk data is mapped to Milvus
+  fields.
+
+  Args:
+    connection_params: Configuration for connecting to the Milvus server,
+      including URI, credentials, and connection options.
+    write_config: Configuration for write operations including collection name,
+      partition, batch size, and timeouts.
+    column_specs: List of column specifications defining how chunk fields are
+      mapped to Milvus collection fields. Defaults to standard RAG fields
+      (id, embedding, sparse_embedding, content, metadata).
+
+  Example:
+    config = MilvusVectorWriterConfig(
+        connection_params=MilvusConnectionParameters(
+            uri="http://localhost:19530"),
+        write_config=MilvusWriteConfig(collection_name="my_collection"),
+        column_specs=MilvusVectorWriterConfig.default_column_specs())
+  """
+  connection_params: MilvusConnectionParameters
+  write_config: MilvusWriteConfig
+  column_specs: List[ColumnSpec] = field(
+      default_factory=lambda: MilvusVectorWriterConfig.default_column_specs())
+
+  def create_converter(self) -> Callable[[Chunk], Dict[str, Any]]:
+    """Creates a function to convert Apache Beam Chunks to Milvus records.
+
+    Returns:
+      A function that takes a Chunk and returns a dictionary representing
+      a Milvus record with fields mapped according to column_specs.
+    """
+    def convert(chunk: Chunk) -> Dict[str, Any]:
+      result = {}
+      for col in self.column_specs:
+        result[col.column_name] = col.value_fn(chunk)
+      return result
+
+    return convert
+
+  def create_write_transform(self) -> beam.PTransform:
+    """Creates the Apache Beam transform for writing to Milvus.
+
+    Returns:
+      A PTransform that can be applied to a PCollection of Chunks to write
+      them to the configured Milvus collection.
+    """
+    return _WriteToMilvusVectorDatabase(self)
+
+  @staticmethod
+  def default_column_specs() -> List[ColumnSpec]:
+    """Returns default column specifications for RAG use cases.
+
+    Creates column mappings for standard RAG fields: id, dense embedding,
+    sparse embedding, content text, and metadata. These specifications
+    define how Chunk fields are converted to Milvus-compatible formats.
+
+    Returns:
+      List of ColumnSpec objects defining the default field mappings.
+    """
+    column_specs = ColumnSpecsBuilder()
+    return column_specs\
+        .with_id_spec()\
+        .with_embedding_spec(convert_fn=lambda values: list(values))\
+        .with_sparse_embedding_spec(conv_fn=MilvusHelpers.sparse_embedding)\
+        .with_content_spec()\
+        .with_metadata_spec(convert_fn=lambda values: dict(values))\
+        .build()
+
+
+class _WriteToMilvusVectorDatabase(beam.PTransform):
+  """Apache Beam PTransform for writing vector data to Milvus.
+
+  This transform handles the conversion of Apache Beam Chunks to Milvus
+  records and coordinates the write operations. It applies the configured
+  converter function and uses a DoFn for batched writes to optimize
+  performance.
+
+  Args:
+    config: MilvusVectorWriterConfig containing all necessary parameters for
+      the write operation.
+  """
+  def __init__(self, config: MilvusVectorWriterConfig):
+    self.config = config
+
+  def expand(self, pcoll: beam.PCollection[Chunk]):
+    """Expands the PTransform to convert chunks and write to Milvus.
+
+    Args:
+      pcoll: PCollection of Chunk objects to write to Milvus.
+
+    Returns:
+      PCollection of the same Chunk objects after writing to Milvus.
+    """
+    return (
+        pcoll
+        | "Convert to Records" >> beam.Map(self.config.create_converter())
+        | beam.ParDo(
+            _WriteMilvusFn(
+                self.config.connection_params, self.config.write_config)))
+
+
+class _WriteMilvusFn(DoFn):
+  """DoFn that handles batched writes to Milvus.
+
+  This DoFn accumulates records in batches and flushes them to Milvus when
+  the batch size is reached or when the bundle finishes. This approach
+  optimizes performance by reducing the number of individual write operations.
+
+  Args:
+    connection_params: Configuration for connecting to the Milvus server.
+    write_config: Configuration for write operations including batch size
+      and collection details.
+ """ + def __init__( + self, + connection_params: MilvusConnectionParameters, + write_config: MilvusWriteConfig): + self._connection_params = connection_params + self._write_config = write_config + self.batch = [] + + def process(self, element, *args, **kwargs): + """Processes individual records, batching them for efficient writes. + + Args: + element: A dictionary representing a Milvus record to write. + *args: Additional positional arguments. + **kwargs: Additional keyword arguments. + + Yields: + The original element after adding it to the batch. + """ + _ = args, kwargs # Unused parameters + self.batch.append(element) + if len(self.batch) >= self._write_config.write_batch_size: + self._flush() + yield element + + def finish_bundle(self): + """Called when a bundle finishes processing. + + Flushes any remaining records in the batch to ensure all data is written. + """ + self._flush() + + def _flush(self): + """Flushes the current batch of records to Milvus. + + Creates a MilvusSink connection and writes all batched records, + then clears the batch for the next set of records. + """ + if len(self.batch) == 0: + return + with _MilvusSink(self._connection_params, self._write_config) as sink: + sink.write(self.batch) + self.batch = [] + + def display_data(self): + """Returns display data for monitoring and debugging. + + Returns: + Dictionary containing database, collection, and batch size information + for display in the Apache Beam monitoring UI. + """ + res = super().display_data() + res["database"] = self._connection_params.db_name + res["collection"] = self._write_config.collection_name + res["batch_size"] = self._write_config.write_batch_size + return res + + +class _MilvusSink: + """Low-level sink for writing data directly to Milvus. + + This class handles the direct interaction with the Milvus client for + upsert operations. It manages the connection lifecycle and provides + context manager support for proper resource cleanup. + + Args: + connection_params: Configuration for connecting to the Milvus server. + write_config: Configuration for write operations including collection + and partition targeting. + """ + def __init__( + self, + connection_params: MilvusConnectionParameters, + write_config: MilvusWriteConfig): + self._connection_params = connection_params + self._write_config = write_config + self._client = None + + def write(self, documents): + """Writes a batch of documents to the Milvus collection. + + Performs an upsert operation to insert new documents or update existing + ones based on primary key. After the upsert, flushes the collection to + ensure data persistence. + + Args: + documents: List of dictionaries representing Milvus records to write. + Each dictionary should contain fields matching the collection schema. + """ + if not self._client: + self._client = MilvusClient( + **unpack_dataclass_with_kwargs(self._connection_params)) + + try: + resp = self._client.upsert( + collection_name=self._write_config.collection_name, + partition_name=self._write_config.partition_name, + data=documents, + timeout=self._write_config.timeout, + **self._write_config.kwargs) + + # Try to flush, but handle connection issues gracefully. + try: + self._client.flush(self._write_config.collection_name) + except Exception as e: + # If flush fails due to connection issues, log but don't fail the write. 
+ _LOGGER.warning( + "Flush operation failed, but upsert was successful: %s", e) + + _LOGGER.debug( + "Upserted into Milvus: upsert_count=%d, cost=%d", + resp.get("upsert_count", 0), + resp.get("cost", 0)) + except Exception as e: + _LOGGER.error("Failed to write to Milvus: %s", e) + raise + + def __enter__(self): + """Enters the context manager and establishes Milvus connection. + + Returns: + Self, enabling use in 'with' statements. + """ + if not self._client: + self._client = MilvusClient( + **unpack_dataclass_with_kwargs(self._connection_params)) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Exits the context manager and closes the Milvus connection. + + Args: + exc_type: Exception type if an exception was raised. + exc_val: Exception value if an exception was raised. + exc_tb: Exception traceback if an exception was raised. + """ + _ = exc_type, exc_val, exc_tb # Unused parameters + if self._client: + self._client.close() diff --git a/sdks/python/apache_beam/ml/rag/ingestion/milvus_search_it_test.py b/sdks/python/apache_beam/ml/rag/ingestion/milvus_search_it_test.py new file mode 100644 index 000000000000..f8f01d9d5964 --- /dev/null +++ b/sdks/python/apache_beam/ml/rag/ingestion/milvus_search_it_test.py @@ -0,0 +1,616 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import platform +from typing import Callable, cast +import unittest +import uuid + +import pytest +from pymilvus import CollectionSchema, DataType, MilvusClient +from pymilvus import FieldSchema +from pymilvus.milvus_client import IndexParams + +import apache_beam as beam + +from apache_beam.ml.rag.types import Chunk +from apache_beam.ml.rag.types import Content +from apache_beam.ml.rag.types import Embedding +from apache_beam.ml.rag.utils import MilvusConnectionParameters +from apache_beam.ml.rag.test_utils import ( + VectorDBContainerInfo, MilvusTestHelpers) +from apache_beam.testing.test_pipeline import TestPipeline +from apache_beam.ml.rag.ingestion.jdbc_common import WriteConfig +from apache_beam.ml.rag.utils import unpack_dataclass_with_kwargs + +try: + from apache_beam.ml.rag.ingestion.milvus_search import ( + MilvusWriteConfig, MilvusVectorWriterConfig) +except ImportError as e: + raise unittest.SkipTest(f'Milvus dependencies not installed: {str(e)}') + + +def _construct_index_params(): + index_params = IndexParams() + + # Dense vector index for dense embeddings. + index_params.add_index( + field_name="embedding", + index_name="embedding_ivf_flat", + index_type="IVF_FLAT", + metric_type="COSINE", + params={"nlist": 1}) + + # Sparse vector index for sparse embeddings. 
+ index_params.add_index( + field_name="sparse_embedding", + index_name="sparse_embedding_inverted_index", + index_type="SPARSE_INVERTED_INDEX", + metric_type="IP", + params={"inverted_index_algo": "TAAT_NAIVE"}) + + return index_params + + +MILVUS_INGESTION_IT_CONFIG = { + "fields": [ + FieldSchema( + name="id", dtype=DataType.INT64, is_primary=True, auto_id=False), + FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=1000), + FieldSchema(name="metadata", dtype=DataType.JSON), + FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=3), + FieldSchema( + name="sparse_embedding", dtype=DataType.SPARSE_FLOAT_VECTOR) + ], + "index": _construct_index_params, + "corpus": [ + Chunk( + id=1, + content=Content(text="Test document one"), + metadata={"source": "test1"}, + embedding=Embedding( + dense_embedding=[0.1, 0.2, 0.3], + sparse_embedding=([1, 2], [0.1, 0.2])), + ), + Chunk( + id=2, + content=Content(text="Test document two"), + metadata={"source": "test2"}, + embedding=Embedding( + dense_embedding=[0.2, 0.3, 0.4], + sparse_embedding=([2, 3], [0.3, 0.1]), + ), + ), + Chunk( + id=3, + content=Content(text="Test document three"), + metadata={"source": "test3"}, + embedding=Embedding( + dense_embedding=[0.3, 0.4, 0.5], + sparse_embedding=([3, 4], [0.4, 0.2]), + ), + ) + ] +} + + +def create_collection_with_partition( + client: MilvusClient, + collection_name: str, + partition_name: str = '', + fields=MILVUS_INGESTION_IT_CONFIG["fields"]): + # Configure schema. + schema = CollectionSchema(fields=fields) + + # Configure index. + index_function: Callable[[], IndexParams] = cast( + Callable[[], IndexParams], MILVUS_INGESTION_IT_CONFIG["index"]) + + # Create collection with schema. + client.create_collection( + collection_name=collection_name, + schema=schema, + index_params=index_function()) + + # Create partition within the collection. + client.create_partition( + collection_name=collection_name, partition_name=partition_name) + + msg = f"Expected collection '{collection_name}' to be created." + assert client.has_collection(collection_name), msg + + msg = f"Expected partition '{partition_name}' to be created." + assert client.has_partition(collection_name, partition_name), msg + + # Release the collection from memory. We don't need that on pure writing. + client.release_collection(collection_name) + + +def drop_collection(client: MilvusClient, collection_name: str): + try: + client.drop_collection(collection_name) + assert not client.has_collection(collection_name) + except Exception: + # Silently ignore connection errors during cleanup. + pass + + +@pytest.mark.uses_testcontainer +@unittest.skipUnless( + platform.system() == "Linux", + "Test runs only on Linux due to lack of support, as yet, for nested " + "virtualization in CI environments on Windows/macOS. 
Many CI providers run " + "tests in virtualized environments, and nested virtualization " + "(Docker inside a VM) is either unavailable or has several issues on " + "non-Linux platforms.") +class TestMilvusVectorWriterConfig(unittest.TestCase): + """Integration tests for Milvus vector database ingestion functionality""" + + _db: VectorDBContainerInfo + _version = "milvusdb/milvus:v2.5.10" + + @classmethod + def setUpClass(cls): + cls._db = MilvusTestHelpers.start_db_container( + cls._version, vector_client_max_retries=3) + cls._connection_config = MilvusConnectionParameters( + uri=cls._db.uri, + user=cls._db.user, + password=cls._db.password, + db_name=cls._db.id, + token=cls._db.token) + + @classmethod + def tearDownClass(cls): + MilvusTestHelpers.stop_db_container(cls._db) + cls._db = None + + def setUp(self): + self.write_test_pipeline = TestPipeline() + self.write_test_pipeline.not_use_test_runner_api = True + self._collection_name = f"test_collection_{self._testMethodName}" + self._partition_name = f"test_partition_{self._testMethodName}" + config = unpack_dataclass_with_kwargs(self._connection_config) + config["alias"] = f"milvus_conn_{uuid.uuid4().hex[:8]}" + self._test_client = MilvusClient(**config) + create_collection_with_partition( + self._test_client, self._collection_name, self._partition_name) + + def tearDown(self): + drop_collection(self._test_client, self._collection_name) + self._test_client.close() + + def test_invalid_write_on_non_existent_collection(self): + non_existent_collection = "nonexistent_collection" + + test_chunks = MILVUS_INGESTION_IT_CONFIG["corpus"] + + write_config = MilvusWriteConfig( + collection_name=non_existent_collection, + write_config=WriteConfig(write_batch_size=1)) + config = MilvusVectorWriterConfig( + connection_params=self._connection_config, + write_config=write_config, + ) + + # Write pipeline. + with self.assertRaises(Exception) as context: + with TestPipeline() as p: + _ = (p | beam.Create(test_chunks) | config.create_write_transform()) + + # Assert on what should happen. + self.assertIn("can't find collection", str(context.exception).lower()) + + def test_invalid_write_on_non_existent_partition(self): + non_existent_partition = "nonexistent_partition" + + test_chunks = MILVUS_INGESTION_IT_CONFIG["corpus"] + + write_config = MilvusWriteConfig( + collection_name=self._collection_name, + partition_name=non_existent_partition, + write_config=WriteConfig(write_batch_size=1)) + config = MilvusVectorWriterConfig( + connection_params=self._connection_config, write_config=write_config) + + # Write pipeline. + with self.assertRaises(Exception) as context: + with TestPipeline() as p: + _ = (p | beam.Create(test_chunks) | config.create_write_transform()) + + # Assert on what should happen. + self.assertIn("partition not found", str(context.exception).lower()) + + def test_invalid_write_on_missing_primary_key_in_entity(self): + test_chunks = [ + Chunk( + content=Content(text="Test content without ID"), + embedding=Embedding( + dense_embedding=[0.1, 0.2, 0.3], + sparse_embedding=([1, 2], [0.1, 0.2])), + metadata={"source": "test"}) + ] + + write_config = MilvusWriteConfig( + collection_name=self._collection_name, + partition_name=self._partition_name, + write_config=WriteConfig(write_batch_size=1)) + + # Deliberately remove id primary key from the entity. 
+ specs = MilvusVectorWriterConfig.default_column_specs() + for i, spec in enumerate(specs): + if spec.column_name == "id": + del specs[i] + break + + config = MilvusVectorWriterConfig( + connection_params=self._connection_config, + write_config=write_config, + column_specs=specs) + + # Write pipeline. + with self.assertRaises(Exception) as context: + with TestPipeline() as p: + _ = (p | beam.Create(test_chunks) | config.create_write_transform()) + + # Assert on what should happen. + self.assertIn( + "insert missed an field `id` to collection", + str(context.exception).lower()) + + def test_write_on_auto_id_primary_key(self): + auto_id_collection = f"auto_id_collection_{self._testMethodName}" + auto_id_partition = f"auto_id_partition_{self._testMethodName}" + auto_id_fields = [ + FieldSchema( + name="id", dtype=DataType.INT64, is_primary=True, auto_id=True), + FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=1000), + FieldSchema(name="metadata", dtype=DataType.JSON), + FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=3), + FieldSchema( + name="sparse_embedding", dtype=DataType.SPARSE_FLOAT_VECTOR) + ] + + # Create collection with an auto id field. + create_collection_with_partition( + client=self._test_client, + collection_name=auto_id_collection, + partition_name=auto_id_partition, + fields=auto_id_fields) + + test_chunks = [ + Chunk( + id=1, + content=Content(text="Test content without ID"), + embedding=Embedding( + dense_embedding=[0.1, 0.2, 0.3], + sparse_embedding=([1, 2], [0.1, 0.2])), + metadata={"source": "test"}) + ] + + write_config = MilvusWriteConfig( + collection_name=auto_id_collection, + partition_name=auto_id_partition, + write_config=WriteConfig(write_batch_size=1)) + + config = MilvusVectorWriterConfig( + connection_params=self._connection_config, write_config=write_config) + + with self.write_test_pipeline as p: + _ = (p | beam.Create(test_chunks) | config.create_write_transform()) + + self._test_client.load_collection(auto_id_collection) + result = self._test_client.query( + collection_name=auto_id_collection, + partition_names=[auto_id_partition], + limit=3) + + # Test there is only one item in the result and the ID is not equal to one. + self.assertEqual(len(result), len(test_chunks)) + result_item = dict(result[0]) + self.assertNotEqual(result_item["id"], 1) + + def test_write_on_existent_collection_with_default_schema(self): + test_chunks = MILVUS_INGESTION_IT_CONFIG["corpus"] + + write_config = MilvusWriteConfig( + collection_name=self._collection_name, + partition_name=self._partition_name, + write_config=WriteConfig(write_batch_size=3)) + config = MilvusVectorWriterConfig( + connection_params=self._connection_config, write_config=write_config) + + with self.write_test_pipeline as p: + _ = (p | beam.Create(test_chunks) | config.create_write_transform()) + + # Verify data was written successfully. + self._test_client.load_collection(self._collection_name) + result = self._test_client.query( + collection_name=self._collection_name, + partition_names=[self._partition_name], + limit=10) + + self.assertEqual(len(result), len(test_chunks)) + + # Verify each chunk was written correctly. 
+ result_by_id = {item["id"]: item for item in result} + for chunk in test_chunks: + self.assertIn(chunk.id, result_by_id) + result_item = result_by_id[chunk.id] + self.assertEqual( + result_item["content"], + chunk.content.text + if hasattr(chunk.content, 'text') else chunk.content) + self.assertEqual(result_item["metadata"], chunk.metadata) + + # Verify embedding is present and has correct length. + expected_embedding = chunk.embedding.dense_embedding + actual_embedding = result_item["embedding"] + self.assertIsNotNone(actual_embedding) + self.assertEqual(len(actual_embedding), len(expected_embedding)) + + def test_write_with_custom_column_specifications(self): + from apache_beam.ml.rag.ingestion.postgres_common import ColumnSpec + from apache_beam.ml.rag.utils import MilvusHelpers + + custom_column_specs = [ + ColumnSpec("id", int, lambda chunk: int(chunk.id) if chunk.id else 0), + ColumnSpec( + "content", + str, lambda chunk: ( + chunk.content.text + if hasattr(chunk.content, 'text') else chunk.content)), + ColumnSpec("metadata", dict, lambda chunk: chunk.metadata or {}), + ColumnSpec( + "embedding", + list, lambda chunk: chunk.embedding.dense_embedding or []), + ColumnSpec( + "sparse_embedding", + dict, lambda chunk: ( + MilvusHelpers.sparse_embedding( + chunk.embedding.sparse_embedding) if chunk.embedding and + chunk.embedding.sparse_embedding else {})) + ] + + test_chunks = [ + Chunk( + id=10, + content=Content(text="Custom column spec test"), + embedding=Embedding( + dense_embedding=[0.8, 0.9, 1.0], + sparse_embedding=([1, 3, 5], [0.8, 0.9, 1.0])), + metadata={"custom": "spec_test"}) + ] + + write_config = MilvusWriteConfig( + collection_name=self._collection_name, + partition_name=self._partition_name, + write_config=WriteConfig(write_batch_size=1)) + config = MilvusVectorWriterConfig( + connection_params=self._connection_config, + write_config=write_config, + column_specs=custom_column_specs) + + with self.write_test_pipeline as p: + _ = (p | beam.Create(test_chunks) | config.create_write_transform()) + + # Verify data was written successfully. + self._test_client.load_collection(self._collection_name) + result = self._test_client.query( + collection_name=self._collection_name, + partition_names=[self._partition_name], + filter="id == 10", + limit=1) + + self.assertEqual(len(result), 1) + result_item = result[0] + + # Verify custom column specs worked correctly. + self.assertEqual(result_item["id"], 10) + self.assertEqual(result_item["content"], "Custom column spec test") + self.assertEqual(result_item["metadata"], {"custom": "spec_test"}) + + # Verify embedding is present and has correct length. + expected_embedding = [0.8, 0.9, 1.0] + actual_embedding = result_item["embedding"] + self.assertIsNotNone(actual_embedding) + self.assertEqual(len(actual_embedding), len(expected_embedding)) + + # Verify sparse embedding was converted correctly - check keys are present. + expected_sparse_keys = {1, 3, 5} + actual_sparse = result_item["sparse_embedding"] + self.assertIsNotNone(actual_sparse) + self.assertEqual(set(actual_sparse.keys()), expected_sparse_keys) + + def test_write_with_batching(self): + test_chunks = [ + Chunk( + id=i, + content=Content(text=f"Batch test document {i}"), + embedding=Embedding( + dense_embedding=[0.1 * i, 0.2 * i, 0.3 * i], + sparse_embedding=([i, i + 1], [0.1 * i, 0.2 * i])), + metadata={"batch_id": i}) for i in range(1, 8) # 7 chunks + ] + + # Set small batch size to force batching (7 chunks with batch size 3). 
+ batch_write_config = WriteConfig(write_batch_size=3) + write_config = MilvusWriteConfig( + collection_name=self._collection_name, + partition_name=self._partition_name, + write_config=batch_write_config) + config = MilvusVectorWriterConfig( + connection_params=self._connection_config, write_config=write_config) + + with self.write_test_pipeline as p: + _ = (p | beam.Create(test_chunks) | config.create_write_transform()) + + # Verify all data was written successfully. + self._test_client.load_collection(self._collection_name) + result = self._test_client.query( + collection_name=self._collection_name, + partition_names=[self._partition_name], + limit=10) + + self.assertEqual(len(result), len(test_chunks)) + + # Verify each batch was written correctly. + result_by_id = {item["id"]: item for item in result} + for chunk in test_chunks: + self.assertIn(chunk.id, result_by_id) + result_item = result_by_id[chunk.id] + + # Verify content and metadata. + self.assertEqual(result_item["content"], chunk.content.text) + self.assertEqual(result_item["metadata"], chunk.metadata) + + # Verify embeddings are present and have correct length. + expected_embedding = chunk.embedding.dense_embedding + actual_embedding = result_item["embedding"] + self.assertIsNotNone(actual_embedding) + self.assertEqual(len(actual_embedding), len(expected_embedding)) + + # Verify sparse embedding keys are present. + expected_sparse_keys = {chunk.id, chunk.id + 1} + actual_sparse = result_item["sparse_embedding"] + self.assertIsNotNone(actual_sparse) + self.assertEqual(set(actual_sparse.keys()), expected_sparse_keys) + + def test_idempotent_write(self): + # Step 1: Insert initial data that doesn't exist. + initial_chunks = [ + Chunk( + id=100, + content=Content(text="Initial document"), + embedding=Embedding( + dense_embedding=[1.0, 2.0, 3.0], + sparse_embedding=([100, 101], [1.0, 2.0])), + metadata={"version": 1}), + Chunk( + id=200, + content=Content(text="Another initial document"), + embedding=Embedding( + dense_embedding=[2.0, 3.0, 4.0], + sparse_embedding=([200, 201], [2.0, 3.0])), + metadata={"version": 1}) + ] + + write_config = MilvusWriteConfig( + collection_name=self._collection_name, + partition_name=self._partition_name, + write_config=WriteConfig(write_batch_size=2)) + config = MilvusVectorWriterConfig( + connection_params=self._connection_config, write_config=write_config) + + # Insert initial data. + with TestPipeline() as p: + p.not_use_test_runner_api = True + _ = ( + p | "Create initial" >> beam.Create(initial_chunks) + | "Write initial" >> config.create_write_transform()) + + # Verify initial data was inserted (not existed before). + self._test_client.load_collection(self._collection_name) + result = self._test_client.query( + collection_name=self._collection_name, + partition_names=[self._partition_name], + limit=10) + + self.assertEqual(len(result), 2) + result_by_id = {item["id"]: item for item in result} + + # Verify initial state. + self.assertEqual(result_by_id[100]["content"], "Initial document") + self.assertEqual(result_by_id[100]["metadata"], {"version": 1}) + self.assertEqual(result_by_id[200]["content"], "Another initial document") + self.assertEqual(result_by_id[200]["metadata"], {"version": 1}) + + # Step 2: Update existing data (same IDs, different content). 
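+    # Milvus upserts replace rows that share a primary key, so re-writing the
+    # same IDs should update the rows in place rather than append new ones.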
+ updated_chunks = [ + Chunk( + id=100, + content=Content(text="Updated document"), + embedding=Embedding( + dense_embedding=[1.1, 2.1, 3.1], + sparse_embedding=([100, 102], [1.1, 2.1])), + metadata={"version": 2}), + Chunk( + id=200, + content=Content(text="Another updated document"), + embedding=Embedding( + dense_embedding=[2.1, 3.1, 4.1], + sparse_embedding=([200, 202], [2.1, 3.1])), + metadata={"version": 2}) + ] + + # Perform first update. + with TestPipeline() as p: + p.not_use_test_runner_api = True + _ = ( + p | "Create update1" >> beam.Create(updated_chunks) + | "Write update1" >> config.create_write_transform()) + + # Verify update worked. + self._test_client.load_collection(self._collection_name) + result = self._test_client.query( + collection_name=self._collection_name, + partition_names=[self._partition_name], + limit=10) + + self.assertEqual(len(result), 2) # Still only 2 records. + result_by_id = {item["id"]: item for item in result} + + # Verify updated state. + self.assertEqual(result_by_id[100]["content"], "Updated document") + self.assertEqual(result_by_id[100]["metadata"], {"version": 2}) + self.assertEqual(result_by_id[200]["content"], "Another updated document") + self.assertEqual(result_by_id[200]["metadata"], {"version": 2}) + + # Step 3: Repeat the same update operation 3 more times (idempotence test). + for i in range(3): + with TestPipeline() as p: + p.not_use_test_runner_api = True + _ = ( + p | f"Create repeat{i+2}" >> beam.Create(updated_chunks) + | f"Write repeat{i+2}" >> config.create_write_transform()) + + # Verify state hasn't changed after repeated updates. + self._test_client.load_collection(self._collection_name) + result = self._test_client.query( + collection_name=self._collection_name, + partition_names=[self._partition_name], + limit=10) + + # Still only 2 records. + self.assertEqual(len(result), 2) + result_by_id = {item["id"]: item for item in result} + + # Final state should remain unchanged. + self.assertEqual(result_by_id[100]["content"], "Updated document") + self.assertEqual(result_by_id[100]["metadata"], {"version": 2}) + self.assertEqual(result_by_id[200]["content"], "Another updated document") + self.assertEqual(result_by_id[200]["metadata"], {"version": 2}) + + # Verify embeddings are still correct. + self.assertIsNotNone(result_by_id[100]["embedding"]) + self.assertEqual(len(result_by_id[100]["embedding"]), 3) + self.assertIsNotNone(result_by_id[200]["embedding"]) + self.assertEqual(len(result_by_id[200]["embedding"]), 3) + + +if __name__ == '__main__': + unittest.main() diff --git a/sdks/python/apache_beam/ml/rag/ingestion/milvus_search_test.py b/sdks/python/apache_beam/ml/rag/ingestion/milvus_search_test.py new file mode 100644 index 000000000000..37b05c2e2409 --- /dev/null +++ b/sdks/python/apache_beam/ml/rag/ingestion/milvus_search_test.py @@ -0,0 +1,122 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +import unittest +from parameterized import parameterized + +try: + from apache_beam.ml.rag.ingestion.milvus_search import ( + MilvusWriteConfig, MilvusVectorWriterConfig) + from apache_beam.ml.rag.utils import MilvusConnectionParameters +except ImportError as e: + raise unittest.SkipTest(f'Milvus dependencies not installed: {str(e)}') + + +class TestMilvusWriteConfig(unittest.TestCase): + """Unit tests for MilvusWriteConfig validation errors.""" + def test_empty_collection_name_raises_error(self): + """Test that empty collection name raises ValueError.""" + with self.assertRaises(ValueError) as context: + MilvusWriteConfig(collection_name="") + + self.assertIn("Collection name must be provided", str(context.exception)) + + def test_none_collection_name_raises_error(self): + """Test that None collection name raises ValueError.""" + with self.assertRaises(ValueError) as context: + MilvusWriteConfig(collection_name=None) # type: ignore[arg-type] + + self.assertIn("Collection name must be provided", str(context.exception)) + + +class TestMilvusVectorWriterConfig(unittest.TestCase): + """Unit tests for MilvusVectorWriterConfig validation and functionality.""" + def test_valid_config_creation(self): + """Test creation of valid MilvusVectorWriterConfig.""" + connection_params = MilvusConnectionParameters(uri="http://localhost:19530") + write_config = MilvusWriteConfig(collection_name="test_collection") + + config = MilvusVectorWriterConfig( + connection_params=connection_params, write_config=write_config) + + self.assertEqual(config.connection_params, connection_params) + self.assertEqual(config.write_config, write_config) + self.assertIsNotNone(config.column_specs) + + def test_create_converter_returns_callable(self): + """Test that create_converter returns a callable function.""" + connection_params = MilvusConnectionParameters(uri="http://localhost:19530") + write_config = MilvusWriteConfig(collection_name="test_collection") + + config = MilvusVectorWriterConfig( + connection_params=connection_params, write_config=write_config) + + converter = config.create_converter() + self.assertTrue(callable(converter)) + + def test_create_write_transform_returns_ptransform(self): + """Test that create_write_transform returns a PTransform.""" + connection_params = MilvusConnectionParameters(uri="http://localhost:19530") + write_config = MilvusWriteConfig(collection_name="test_collection") + + config = MilvusVectorWriterConfig( + connection_params=connection_params, write_config=write_config) + + transform = config.create_write_transform() + self.assertIsNotNone(transform) + + def test_default_column_specs_has_expected_fields(self): + """Test that default column specs include expected fields.""" + column_specs = MilvusVectorWriterConfig.default_column_specs() + + self.assertIsInstance(column_specs, list) + self.assertGreater(len(column_specs), 0) + + column_names = [spec.column_name for spec in column_specs] + expected_fields = [ + "id", "embedding", "sparse_embedding", "content", "metadata" + ] + + for field in expected_fields: + self.assertIn(field, column_names) + + @parameterized.expand([ + # Invalid connection parameters - empty URI. + ( + lambda: ( + MilvusConnectionParameters(uri=""), MilvusWriteConfig( + collection_name="test_collection")), + "URI must be provided"), + # Invalid write config - empty collection name. 
+      (
+          lambda: (
+              MilvusConnectionParameters(uri="http://localhost:19530"),
+              MilvusWriteConfig(collection_name="")),
+          "Collection name must be provided"),
+  ])
+  def test_invalid_configuration_parameters(
+      self, create_params, expected_error_msg):
+    """Test validation errors for invalid configuration parameters."""
+    with self.assertRaises(ValueError) as context:
+      connection_params, write_config = create_params()
+      MilvusVectorWriterConfig(
+          connection_params=connection_params, write_config=write_config)
+
+    self.assertIn(expected_error_msg, str(context.exception))
+
+
+if __name__ == '__main__':
+  unittest.main()
diff --git a/sdks/python/apache_beam/ml/rag/ingestion/postgres_common.py b/sdks/python/apache_beam/ml/rag/ingestion/postgres_common.py
index eca740a4e9c3..d07edff83928 100644
--- a/sdks/python/apache_beam/ml/rag/ingestion/postgres_common.py
+++ b/sdks/python/apache_beam/ml/rag/ingestion/postgres_common.py
@@ -16,7 +16,7 @@
 
 import json
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, Tuple
 from typing import Callable
 from typing import Dict
 from typing import List
@@ -311,6 +311,42 @@ def value_fn(chunk: Chunk) -> Any:
         ColumnSpec.vector(column_name=column_name, value_fn=value_fn))
     return self
 
+  def with_sparse_embedding_spec(
+      self,
+      column_name: str = "sparse_embedding",
+      conv_fn: Optional[Callable[[Tuple[List[int], List[float]]], Any]] = None
+  ) -> 'ColumnSpecsBuilder':
+    """Add sparse embedding :class:`.ColumnSpec` with optional conversion.
+
+    Args:
+      column_name: Name for the sparse embedding column
+        (defaults to "sparse_embedding").
+      conv_fn: Optional function to convert the sparse embedding tuple.
+        If None, converts to a PostgreSQL-compatible JSON format.
+
+    Returns:
+      Self for method chaining.
+
+    Example:
+      >>> builder.with_sparse_embedding_spec(
+      ...     column_name="sparse_vector",
+      ...     conv_fn=lambda sparse: dict(zip(sparse[0], sparse[1]))
+      ... )
+    """
+    def value_fn(chunk: Chunk) -> Any:
+      if chunk.embedding is None or chunk.embedding.sparse_embedding is None:
+        raise ValueError(f'Expected chunk to contain sparse embedding. {chunk}')
+      sparse_embedding = chunk.embedding.sparse_embedding
+      if conv_fn:
+        return conv_fn(sparse_embedding)
+      # Default: convert to dict format for JSON storage.
+      indices, values = sparse_embedding
+      return json.dumps(dict(zip(indices, values)))
+
+    self._specs.append(
+        ColumnSpec.jsonb(column_name=column_name, value_fn=value_fn))
+    return self
+
   def add_metadata_field(
       self,
       field: str,
diff --git a/sdks/python/apache_beam/ml/rag/test_utils.py b/sdks/python/apache_beam/ml/rag/test_utils.py
new file mode 100644
index 000000000000..9a46f46397eb
--- /dev/null
+++ b/sdks/python/apache_beam/ml/rag/test_utils.py
@@ -0,0 +1,304 @@
+import contextlib
+from dataclasses import dataclass
+import os
+import socket
+import tempfile
+import logging
+from typing import Dict, List, Optional
+from testcontainers.core.config import testcontainers_config
+from testcontainers.core.generic import DbContainer
+from testcontainers.milvus import MilvusContainer
+import yaml
+
+from apache_beam.ml.rag.types import Chunk
+
+_LOGGER = logging.getLogger(__name__)
+
+
+@dataclass
+class VectorDBContainerInfo:
+  """Container information for vector database test instances.
+
+  Holds connection details and container reference for testing with
+  vector databases like Milvus in containerized environments.
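+
+  Example (illustrative values; assumes a started container):
+
+      info = VectorDBContainerInfo(container, host="localhost", port=19530)
+      info.uri  # -> "http://localhost:19530"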
+  """
+  container: DbContainer
+  host: str
+  port: int
+  user: str = ""
+  password: str = ""
+  token: str = ""
+  id: str = "default"
+
+  @property
+  def uri(self) -> str:
+    return f"http://{self.host}:{self.port}"
+
+
+class TestHelpers:
+  @staticmethod
+  def find_free_port():
+    """Find a free port on the local machine."""
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+      # Bind to port 0, which asks the OS to assign a free port.
+      s.bind(('', 0))
+      s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+      # Return the port number assigned by the OS.
+      return s.getsockname()[1]
+
+
+class CustomMilvusContainer(MilvusContainer):
+  """Custom Milvus container with configurable ports and environment setup.
+
+  Extends MilvusContainer to provide custom port binding and environment
+  configuration for testing with standalone Milvus instances.
+  """
+  def __init__(
+      self,
+      image: str,
+      service_container_port,
+      healthcheck_container_port,
+      **kwargs,
+  ) -> None:
+    # Skip the parent class's constructor and go straight to
+    # GenericContainer.
+    super(MilvusContainer, self).__init__(image=image, **kwargs)
+    self.port = service_container_port
+    self.healthcheck_port = healthcheck_container_port
+    self.with_exposed_ports(service_container_port, healthcheck_container_port)
+
+    # Get free host ports.
+    service_host_port = TestHelpers.find_free_port()
+    healthcheck_host_port = TestHelpers.find_free_port()
+
+    # Bind container and host ports.
+    self.with_bind_ports(service_container_port, service_host_port)
+    self.with_bind_ports(healthcheck_container_port, healthcheck_host_port)
+    self.cmd = "milvus run standalone"
+
+    # Set environment variables needed for Milvus.
+    envs = {
+        "ETCD_USE_EMBED": "true",
+        "ETCD_DATA_DIR": "/var/lib/milvus/etcd",
+        "COMMON_STORAGETYPE": "local",
+        "METRICS_PORT": str(healthcheck_container_port)
+    }
+    for env, value in envs.items():
+      self.with_env(env, value)
+
+
+class MilvusTestHelpers:
+  """Helper utilities for testing Milvus vector database operations.
+
+  Provides static methods for managing test containers, configuration files,
+  and chunk comparison utilities for Milvus-based integration tests.
+  """
+  # IMPORTANT: When upgrading the Milvus server version, ensure the pymilvus
+  # Python SDK client in setup.py is updated to match. Refer to the Milvus
+  # release notes compatibility matrix at
+  # https://milvus.io/docs/release_notes.md or PyPI at
+  # https://pypi.org/project/pymilvus/ for version compatibility.
+  # Example: Milvus v2.6.0 requires pymilvus==2.6.0 (exact match required).
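+  #
+  # Typical lifecycle (illustrative sketch):
+  #
+  #   info = MilvusTestHelpers.start_db_container()
+  #   try:
+  #     ...  # exercise the pipeline under test against info.uri
+  #   finally:
+  #     MilvusTestHelpers.stop_db_container(info)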
+  @staticmethod
+  def start_db_container(
+      image="milvusdb/milvus:v2.5.10",
+      max_vec_fields=5,
+      vector_client_max_retries=3,
+      tc_max_retries=None) -> Optional[VectorDBContainerInfo]:
+    service_container_port = TestHelpers.find_free_port()
+    healthcheck_container_port = TestHelpers.find_free_port()
+    user_yaml_creator = MilvusTestHelpers.create_user_yaml
+    with user_yaml_creator(service_container_port, max_vec_fields) as cfg:
+      info = None
+      original_tc_max_tries = testcontainers_config.max_tries
+      if tc_max_retries is not None:
+        testcontainers_config.max_tries = tc_max_retries
+      for i in range(vector_client_max_retries):
+        try:
+          vector_db_container = CustomMilvusContainer(
+              image=image,
+              service_container_port=service_container_port,
+              healthcheck_container_port=healthcheck_container_port)
+          vector_db_container = vector_db_container.with_volume_mapping(
+              cfg, "/milvus/configs/user.yaml")
+          vector_db_container.start()
+          host = vector_db_container.get_container_host_ip()
+          port = vector_db_container.get_exposed_port(service_container_port)
+          info = VectorDBContainerInfo(vector_db_container, host, port)
+          _LOGGER.info(
+              "milvus db container started successfully on %s.", info.uri)
+          break
+        except Exception as e:
+          stdout_logs, stderr_logs = vector_db_container.get_logs()
+          stdout_logs = stdout_logs.decode("utf-8")
+          stderr_logs = stderr_logs.decode("utf-8")
+          _LOGGER.warning(
+              "Retry %d/%d: Failed to start Milvus DB container. Reason: %s. "
+              "STDOUT logs:\n%s\nSTDERR logs:\n%s",
+              i + 1,
+              vector_client_max_retries,
+              e,
+              stdout_logs,
+              stderr_logs)
+          if i == vector_client_max_retries - 1:
+            _LOGGER.error(
+                "Unable to start milvus db container for I/O tests after %d "
+                "retries. Tests cannot proceed. STDOUT logs:\n%s\n"
+                "STDERR logs:\n%s",
+                vector_client_max_retries,
+                stdout_logs,
+                stderr_logs)
+            raise e
+        finally:
+          testcontainers_config.max_tries = original_tc_max_tries
+      return info
+
+  @staticmethod
+  def stop_db_container(db_info: VectorDBContainerInfo):
+    if db_info is None:
+      _LOGGER.warning("Milvus db info is None. Skipping stop operation.")
+      return
+    _LOGGER.debug("Stopping milvus db container.")
+    db_info.container.stop()
+    _LOGGER.info("milvus db container stopped successfully.")
+
+  @staticmethod
+  @contextlib.contextmanager
+  def create_user_yaml(service_port: int, max_vector_field_num=5):
+    """Creates a temporary user.yaml file for Milvus configuration.
+
+    This user yaml file overrides Milvus default configurations. It sets
+    the Milvus service port to the specified container service port. The
+    default for maxVectorFieldNum is 4, but we need 5
+    (one unique field for each metric).
+
+    Args:
+      service_port: Port number for the Milvus service.
+      max_vector_field_num: Max number of vec fields allowed per collection.
+
+    Yields:
+      str: Path to the created temporary yaml file.
+    """
+    with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml',
+                                     delete=False) as temp_file:
+      # Define the content for user.yaml.
+      user_config = {
+          'proxy': {
+              'maxVectorFieldNum': max_vector_field_num, 'port': service_port
+          },
+          'etcd': {
+              'use': {
+                  'embed': True
+              }, 'data': {
+                  'dir': '/var/lib/milvus/etcd'
+              }
+          }
+      }
+
+      # Write the content to the file.
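+      # yaml.dump sorts keys alphabetically, so the rendered user.yaml looks
+      # roughly like this (illustrative):
+      #   etcd:
+      #     data:
+      #       dir: /var/lib/milvus/etcd
+      #     use:
+      #       embed: true
+      #   proxy:
+      #     maxVectorFieldNum: 5
+      #     port: <service_port>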
+      yaml.dump(user_config, temp_file, default_flow_style=False)
+      path = temp_file.name
+
+    try:
+      yield path
+    finally:
+      if os.path.exists(path):
+        os.remove(path)
+
+  @staticmethod
+  def assert_chunks_equivalent(
+      actual_chunks: List[Chunk], expected_chunks: List[Chunk]):
+    """Asserts chunk equivalence, checking presence rather than exact match."""
+    # Sort both lists by ID to ensure consistent ordering.
+    actual_sorted = sorted(actual_chunks, key=lambda c: c.id)
+    expected_sorted = sorted(expected_chunks, key=lambda c: c.id)
+
+    actual_len = len(actual_sorted)
+    expected_len = len(expected_sorted)
+    err_msg = (
+        f"Different number of chunks, actual: {actual_len}, "
+        f"expected: {expected_len}")
+    assert actual_len == expected_len, err_msg
+
+    for actual, expected in zip(actual_sorted, expected_sorted):
+      # Assert that IDs match.
+      assert actual.id == expected.id
+
+      # Assert that dense embeddings match.
+      err_msg = f"Dense embedding mismatch for chunk {actual.id}"
+      assert actual.dense_embedding == expected.dense_embedding, err_msg
+
+      # Assert that sparse embeddings match.
+      err_msg = f"Sparse embedding mismatch for chunk {actual.id}"
+      assert actual.sparse_embedding == expected.sparse_embedding, err_msg
+
+      # Assert that text content matches.
+      err_msg = f"Text content mismatch for chunk {actual.id}"
+      assert actual.content.text == expected.content.text, err_msg
+
+      # For enrichment_data, be more flexible.
+      # If "expected" has values for enrichment_data but actual doesn't, that's
+      # acceptable since vector search results can vary based on many factors
+      # including implementation details, vector database state, and slight
+      # variations in similarity calculations.
+
+      # First ensure the enrichment data key exists.
+      err_msg = f"Missing enrichment_data key in chunk {actual.id}"
+      assert 'enrichment_data' in actual.metadata, err_msg
+
+      # For enrichment_data, ensure consistent ordering of results.
+      actual_data = actual.metadata['enrichment_data']
+      expected_data = expected.metadata['enrichment_data']
+
+      # If actual has enrichment data, then perform detailed validation.
+      if actual_data:
+        # Ensure the id key exists.
+        err_msg = f"Missing id key in metadata {actual.id}"
+        assert 'id' in actual_data, err_msg
+
+        # Validate IDs have consistent ordering.
+        actual_ids = sorted(actual_data['id'])
+        expected_ids = sorted(expected_data['id'])
+        err_msg = f"IDs in enrichment_data don't match for chunk {actual.id}"
+        assert actual_ids == expected_ids, err_msg
+
+        # Ensure the distance key exists.
+        err_msg = f"Missing distance key in metadata {actual.id}"
+        assert 'distance' in actual_data, err_msg
+
+        # Validate distances are present with the expected cardinality.
+        actual_distances = actual_data['distance']
+        expected_distances = expected_data['distance']
+        err_msg = (
+            "Number of distances doesn't match expected count for "
+            f"chunk {actual.id}")
+        assert len(actual_distances) == len(expected_distances), err_msg
+
+        # Ensure the fields key exists.
+        err_msg = f"Missing fields key in metadata {actual.id}"
+        assert 'fields' in actual_data, err_msg
+
+        # Validate fields have consistent content.
+        # Sort fields by 'id' to ensure consistent ordering.
+        actual_fields_sorted = sorted(
+            actual_data['fields'], key=lambda f: f.get('id', 0))
+        expected_fields_sorted = sorted(
+            expected_data['fields'], key=lambda f: f.get('id', 0))
+
+        # Compare field IDs.
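+        # Field IDs are compared before per-field payloads so that an
+        # ordering or membership mismatch fails fast with a concise message.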
+        actual_field_ids = [f.get('id') for f in actual_fields_sorted]
+        expected_field_ids = [f.get('id') for f in expected_fields_sorted]
+        err_msg = f"Field IDs don't match for chunk {actual.id}"
+        assert actual_field_ids == expected_field_ids, err_msg
+
+        # Compare field content.
+        for a_f, e_f in zip(actual_fields_sorted, expected_fields_sorted):
+          # Ensure the id key exists.
+          err_msg = f"Missing id key in metadata.fields {actual.id}"
+          assert 'id' in a_f, err_msg
+
+          err_msg = f"Field ID mismatch for chunk {actual.id}"
+          assert a_f['id'] == e_f['id'], err_msg
+
+          # Validate field metadata.
+          err_msg = f"Field metadata doesn't match for chunk {actual.id}"
+          assert a_f['metadata'] == e_f['metadata'], err_msg
diff --git a/sdks/python/apache_beam/ml/rag/utils.py b/sdks/python/apache_beam/ml/rag/utils.py
new file mode 100644
index 000000000000..c9c39d074c4a
--- /dev/null
+++ b/sdks/python/apache_beam/ml/rag/utils.py
@@ -0,0 +1,129 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from collections import defaultdict
+from dataclasses import dataclass, field
+import re
+from typing import Any, Dict, List, Optional, Tuple
+import uuid
+
+from apache_beam.ml.rag.types import Chunk, Content, Embedding
+
+# Default batch size for writing data to Milvus, matching
+# JdbcIO.DEFAULT_BATCH_SIZE.
+DEFAULT_WRITE_BATCH_SIZE = 1000
+
+
+@dataclass
+class MilvusConnectionParameters:
+  """Configurations for establishing connections to Milvus servers.
+
+  Args:
+    uri: URI endpoint for connecting to Milvus server in the format
+      "http(s)://hostname:port".
+    user: Username for authentication. Required if authentication is enabled
+      and not using token authentication.
+    password: Password for authentication. Required if authentication is
+      enabled and not using token authentication.
+    db_name: Database name to connect to. Specifies which Milvus database to
+      use. Defaults to 'default'.
+    token: Authentication token as an alternative to username/password.
+    timeout: Connection timeout in seconds. Uses client default if None.
+    kwargs: Optional keyword arguments for additional connection parameters.
+      Enables forward compatibility.
+  """
+  uri: str
+  user: str = field(default_factory=str)
+  password: str = field(default_factory=str)
+  db_name: str = "default"
+  token: str = field(default_factory=str)
+  timeout: Optional[float] = None
+  kwargs: Dict[str, Any] = field(default_factory=dict)
+
+  def __post_init__(self):
+    if not self.uri:
+      raise ValueError("URI must be provided for Milvus connection")
+
+    # Generate unique alias if not provided. One-to-one mapping between alias
+    # and connection - each alias represents exactly one Milvus connection.
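+    # A generated alias looks like "milvus_conn_1a2b3c4d" (illustrative).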
+    if "alias" not in self.kwargs:
+      alias = f"milvus_conn_{uuid.uuid4().hex[:8]}"
+      self.kwargs["alias"] = alias
+
+
+class MilvusHelpers:
+  """Utility class providing helper methods for Milvus vector db operations."""
+  @staticmethod
+  def sparse_embedding(
+      sparse_vector: Tuple[List[int],
+                           List[float]]) -> Optional[Dict[int, float]]:
+    if not sparse_vector:
+      return None
+    # Converts sparse embedding from (indices, values) tuple format to
+    # Milvus-compatible values dict format {dimension_index: value, ...}.
+    indices, values = sparse_vector
+    return {int(idx): float(val) for idx, val in zip(indices, values)}
+
+
+def parse_chunk_strings(chunk_str_list: List[str]) -> List[Chunk]:
+  parsed_chunks = []
+
+  # Define safe globals and disable built-in functions for safety.
+  safe_globals = {
+      'Chunk': Chunk,
+      'Content': Content,
+      'Embedding': Embedding,
+      'defaultdict': defaultdict,
+      'list': list,
+      '__builtins__': {}
+  }
+
+  for raw_str in chunk_str_list:
+    try:
+      # Replace "defaultdict(<class 'list'>" with an actual list reference
+      # so the chunk repr can be evaluated.
+      cleaned_str = re.sub(
+          r"defaultdict\(<class 'list'>", "defaultdict(list", raw_str)
+
+      # Evaluate string in restricted environment.
+      chunk = eval(cleaned_str, safe_globals)  # pylint: disable=eval-used
+      if isinstance(chunk, Chunk):
+        parsed_chunks.append(chunk)
+      else:
+        raise ValueError("Parsed object is not a Chunk instance")
+    except Exception as e:
+      raise ValueError(f"Error parsing string:\n{raw_str}\n{e}") from e
+
+  return parsed_chunks
+
+
+def unpack_dataclass_with_kwargs(dataclass_instance):
+  """Unpacks dataclass fields into a flat dict, merging kwargs with precedence.
+
+  Args:
+    dataclass_instance: Dataclass instance to unpack.
+
+  Returns:
+    dict: Flattened dictionary with kwargs taking precedence over fields.
+  """
+  # Create a copy of the dataclass's __dict__.
+  params_dict: dict = dataclass_instance.__dict__.copy()
+
+  # Extract the nested kwargs dictionary.
+  nested_kwargs = params_dict.pop('kwargs', {})
+
+  # Merge the dictionaries, with nested_kwargs taking precedence
+  # in case of duplicate keys.
+  return {**params_dict, **nested_kwargs}

From f3a0b880a8769947f21877aa3ad69d5901e4b56b Mon Sep 17 00:00:00 2001
From: Mohamed Awnallah
Date: Fri, 31 Oct 2025 16:41:16 +0000
Subject: [PATCH 05/35] sdks/python: fix linting issues

---
 .../apache_beam/ml/rag/enrichment/milvus_search.py |  1 -
 .../ml/rag/enrichment/milvus_search_it_test.py     | 11 -----------
 .../apache_beam/ml/rag/ingestion/milvus_search.py  |  4 +---
 .../ml/rag/ingestion/milvus_search_it_test.py      |  6 +++++-
 sdks/python/apache_beam/ml/rag/test_utils.py       |  2 +-
 5 files changed, 7 insertions(+), 17 deletions(-)

diff --git a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search.py b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search.py
index d488c8d3d80d..7a0c38d6d90e 100644
--- a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search.py
+++ b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search.py
@@ -25,7 +25,6 @@
 from typing import Optional
 from typing import Tuple
 from typing import Union
-import uuid
 
 from google.protobuf.json_format import MessageToDict
 from pymilvus import AnnSearchRequest
diff --git a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py
index 094788664bdb..ed6f52e004fa 100644
--- a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py
+++ b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py
@@ -15,25 +15,17 @@
 # limitations under the License.
# -import contextlib import logging -import os import platform -import re -import socket -import tempfile import unittest -from collections import defaultdict from dataclasses import dataclass from dataclasses import field from typing import Callable from typing import Dict from typing import List -from typing import Optional from typing import cast import pytest -import yaml import apache_beam as beam from apache_beam.ml.rag.types import Chunk @@ -53,9 +45,6 @@ MilvusClient, RRFRanker) from pymilvus.milvus_client import IndexParams - from testcontainers.core.config import testcontainers_config - from testcontainers.core.generic import DbContainer - from testcontainers.milvus import MilvusContainer from apache_beam.transforms.enrichment import Enrichment from apache_beam.ml.rag.test_utils import ( MilvusTestHelpers, VectorDBContainerInfo) diff --git a/sdks/python/apache_beam/ml/rag/ingestion/milvus_search.py b/sdks/python/apache_beam/ml/rag/ingestion/milvus_search.py index 041349efeb77..e93dbbef776f 100644 --- a/sdks/python/apache_beam/ml/rag/ingestion/milvus_search.py +++ b/sdks/python/apache_beam/ml/rag/ingestion/milvus_search.py @@ -15,7 +15,7 @@ # limitations under the License. from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List, NamedTuple, Optional +from typing import Any, Callable, Dict, List, Optional from pymilvus import MilvusClient @@ -29,7 +29,6 @@ from apache_beam.ml.rag.types import Chunk from apache_beam.ml.rag.utils import ( MilvusHelpers, unpack_dataclass_with_kwargs, DEFAULT_WRITE_BATCH_SIZE) -from apache_beam.ml.rag.utils import unpack_dataclass_with_kwargs from apache_beam.transforms import DoFn from apache_beam.ml.rag.utils import MilvusConnectionParameters @@ -117,7 +116,6 @@ def create_converter(self) -> Callable[[Chunk], Dict[str, Any]]: A function that takes a Chunk and returns a dictionary representing a Milvus record with fields mapped according to column_specs. """ - """Creates a function to convert Chunks to records.""" def convert(chunk: Chunk) -> Dict[str, Any]: result = {} for col in self.column_specs: diff --git a/sdks/python/apache_beam/ml/rag/ingestion/milvus_search_it_test.py b/sdks/python/apache_beam/ml/rag/ingestion/milvus_search_it_test.py index f8f01d9d5964..b2871ce431ef 100644 --- a/sdks/python/apache_beam/ml/rag/ingestion/milvus_search_it_test.py +++ b/sdks/python/apache_beam/ml/rag/ingestion/milvus_search_it_test.py @@ -112,7 +112,11 @@ def create_collection_with_partition( client: MilvusClient, collection_name: str, partition_name: str = '', - fields=MILVUS_INGESTION_IT_CONFIG["fields"]): + fields=None): + + if fields is None: + fields = MILVUS_INGESTION_IT_CONFIG["fields"] + # Configure schema. 
   schema = CollectionSchema(fields=fields)
diff --git a/sdks/python/apache_beam/ml/rag/test_utils.py b/sdks/python/apache_beam/ml/rag/test_utils.py
index 9a46f46397eb..4e87f2e884a1 100644
--- a/sdks/python/apache_beam/ml/rag/test_utils.py
+++ b/sdks/python/apache_beam/ml/rag/test_utils.py
@@ -4,7 +4,7 @@
 import socket
 import tempfile
 import logging
-from typing import Dict, List, Optional
+from typing import List, Optional
 from testcontainers.core.config import testcontainers_config
 from testcontainers.core.generic import DbContainer
 from testcontainers.milvus import MilvusContainer

From 4cbe014851de5db9280930d5c6366690df24e0e3 Mon Sep 17 00:00:00 2001
From: Mohamed Awnallah
Date: Fri, 31 Oct 2025 16:42:13 +0000
Subject: [PATCH 06/35] sdks/python: add missing apache beam license header
 for `test_utils.py`

---
 sdks/python/apache_beam/ml/rag/test_utils.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/sdks/python/apache_beam/ml/rag/test_utils.py b/sdks/python/apache_beam/ml/rag/test_utils.py
index 4e87f2e884a1..325babdc7037 100644
--- a/sdks/python/apache_beam/ml/rag/test_utils.py
+++ b/sdks/python/apache_beam/ml/rag/test_utils.py
@@ -1,3 +1,20 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + import contextlib from dataclasses import dataclass import os From 461c8fee9d1d4b63b63558d188f88f3e79856309 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Fri, 31 Oct 2025 17:18:21 +0000 Subject: [PATCH 07/35] notebooks/beam-ml: use new refactored code in milvus enrichment handler --- .../beam-ml/milvus_enrichment_transform.ipynb | 338 +++++++++++++----- 1 file changed, 243 insertions(+), 95 deletions(-) diff --git a/examples/notebooks/beam-ml/milvus_enrichment_transform.ipynb b/examples/notebooks/beam-ml/milvus_enrichment_transform.ipynb index 2dbd038f3086..113038e56984 100644 --- a/examples/notebooks/beam-ml/milvus_enrichment_transform.ipynb +++ b/examples/notebooks/beam-ml/milvus_enrichment_transform.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 14, "id": "47053bac", "metadata": {}, "outputs": [], @@ -67,7 +67,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 15, "id": "e550cd55-e91e-4d43-b1bd-b0e89bb8cbd9", "metadata": {}, "outputs": [], @@ -80,7 +80,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 16, "id": "31747c45-107a-49be-8885-5a6cc9dc1236", "metadata": {}, "outputs": [ @@ -88,9 +88,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[33mWARNING: There was an error checking the latest version of pip.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33mWARNING: There was an error checking the latest version of pip.\u001b[0m\u001b[33m\n", - "\u001b[0m" + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.3\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.3\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" ] } ], @@ -103,19 +106,10 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 17, "id": "666e0c2b-0341-4b0e-8d73-561abc39bb10", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/dev/beam/sdks/python/.venv/lib/python3.9/site-packages/pydantic/_internal/_generate_schema.py:2249: UnsupportedFieldAttributeWarning: The 'validate_default' attribute with value True was provided to the `Field()` function, which has no effect in the context it was used. 'validate_default' is field-specific metadata, and can only be attached to a model field using `Annotated` metadata or by assignment. 
This may have happened because an `Annotated` type alias using the `type` statement was used, or if the `Field()` function was attached to a single member of a union type.\n", - " warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ "# Standard library imports\n", "from collections import defaultdict\n", @@ -149,13 +143,13 @@ "from apache_beam.ml.rag.types import Chunk, Content, Embedding\n", "from apache_beam.ml.rag.chunking.base import ChunkingTransformProvider\n", "from apache_beam.ml.rag.embeddings.huggingface import HuggingfaceTextEmbeddings\n", - "from apache_beam.ml.rag.enrichment.milvus_search_it_test import MilvusEnrichmentTestHelper\n", + "from apache_beam.ml.rag.enrichment.milvus_search_it_test import MilvusTestHelpers\n", + "from apache_beam.ml.rag.utils import MilvusConnectionParameters\n", "from apache_beam.ml.rag.enrichment.milvus_search import (\n", " HybridSearchParameters, \n", " KeywordSearchMetrics, \n", " KeywordSearchParameters,\n", " MilvusCollectionLoadParameters, \n", - " MilvusConnectionParameters, \n", " MilvusSearchEnrichmentHandler,\n", " MilvusSearchParameters, \n", " SearchStrategy, \n", @@ -194,7 +188,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 18, "id": "38781cf5-e18f-40f5-827e-2d441ae7d2fa", "metadata": {}, "outputs": [], @@ -287,7 +281,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 19, "id": "489e93b6-de41-4ec3-be33-a15c3cba12e8", "metadata": {}, "outputs": [ @@ -364,7 +358,7 @@ "max 312.000000" ] }, - "execution_count": 6, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -379,7 +373,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 20, "id": "eb32aad0-febd-45af-b4bd-e2176b07e2dc", "metadata": {}, "outputs": [ @@ -424,7 +418,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 21, "id": "5ae9bc82-9ad7-46dd-b254-19cbdcdd0e07", "metadata": {}, "outputs": [], @@ -435,30 +429,30 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 22, "id": "aff7b261-3330-4fa9-9a54-3fd87b42521f", "metadata": {}, "outputs": [], "source": [ "if db:\n", " # Stop existing Milvus DB container to prevent duplicates.\n", - " MilvusEnrichmentTestHelper.stop_db_container(db)\n", - "db = MilvusEnrichmentTestHelper.start_db_container(milvus_version)" + " MilvusTestHelpers.stop_db_container(db)\n", + "db = MilvusTestHelpers.start_db_container(milvus_version)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 26, "id": "31496ee0-75a2-48ad-954e-9c4ae5abbf5e", "metadata": {}, "outputs": [], "source": [ - "milvus_connection_parameters = MilvusConnectionParameters(uri=db.uri, user=db.user, password=db.password, db_id=db.id)" + "milvus_connection_parameters = MilvusConnectionParameters(uri=db.uri, user=db.user, password=db.password, db_name=db.id)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 27, "id": "82627714-2425-4058-9b47-d262f015caf7", "metadata": {}, "outputs": [], @@ -468,7 +462,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 28, "id": "e8a85f51-5d5f-4533-bf0f-ec825e613dc2", "metadata": {}, "outputs": [ @@ -478,7 +472,7 @@ "'2.5.10'" ] }, - "execution_count": 12, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -505,7 +499,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 29, "id": "e3847821-069c-412f-8c20-2406bcac1e55", "metadata": {}, "outputs": [], @@ -520,7 +514,7 @@ }, { 
"cell_type": "code", - "execution_count": 14, + "execution_count": 30, "id": "c014af94-1bb7-44e4-842c-1039f4a2a11d", "metadata": {}, "outputs": [], @@ -545,7 +539,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 31, "id": "54fb3428-b007-4804-9d79-b3933d3256c5", "metadata": {}, "outputs": [], @@ -561,7 +555,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 32, "id": "4c2f123a-5949-4974-af48-a5db5b168c11", "metadata": {}, "outputs": [ @@ -571,7 +565,7 @@ "{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': , 'params': {'max_length': 100}, 'is_primary': True, 'auto_id': False}, {'name': 'content', 'description': '', 'type': , 'params': {'max_length': 65279}}, {'name': 'embedding', 'description': '', 'type': , 'params': {'dim': 384}}, {'name': 'sparse_embedding', 'description': '', 'type': , 'is_function_output': True}, {'name': 'metadata', 'description': '', 'type': }, {'name': 'title_and_content', 'description': '', 'type': , 'params': {'max_length': 65535, 'enable_analyzer': True}}], 'enable_dynamic_field': False, 'functions': [{'name': 'content_bm25_emb', 'description': '', 'type': , 'input_field_names': ['title_and_content'], 'output_field_names': ['sparse_embedding'], 'params': {}}]}" ] }, - "execution_count": 16, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -591,7 +585,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 33, "id": "671f4352-2086-4428-83be-0de48926682d", "metadata": {}, "outputs": [], @@ -609,7 +603,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 34, "id": "aa8baae5-7c38-4e78-ace4-304c7dc6b127", "metadata": {}, "outputs": [], @@ -632,7 +626,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 35, "id": "d970a35b-f9b2-4f8f-93ef-8de5c83c31b5", "metadata": {}, "outputs": [], @@ -647,7 +641,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 36, "id": "0d45a6ad-2009-4e30-b38d-73266da98a06", "metadata": {}, "outputs": [ @@ -658,7 +652,7 @@ " {'field_name': 'sparse_embedding', 'index_type': 'SPARSE_INVERTED_INDEX', 'index_name': 'sparse_inverted_index', 'inverted_index_algo': 'DAAT_MAXSCORE', 'bm25_k1': 1.2, 'bm25_b': 0.75, 'metric_type': 'BM25'}]" ] }, - "execution_count": 20, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -677,7 +671,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 37, "id": "51dd4423-240c-4271-bb8c-6270f399a25c", "metadata": {}, "outputs": [], @@ -687,7 +681,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 38, "id": "9620b1f2-51fa-491c-ad3f-f0676b9b25f6", "metadata": {}, "outputs": [], @@ -697,7 +691,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 39, "id": "e6cf3a1d-265c-44db-aba8-d491fab290d5", "metadata": {}, "outputs": [], @@ -707,7 +701,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 40, "id": "94497411-43d3-4300-98b3-1cb33759738e", "metadata": {}, "outputs": [ @@ -717,7 +711,7 @@ "True" ] }, - "execution_count": 24, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -736,7 +730,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 41, "id": "25c5c202-abe0-4d11-82df-e731f0d6201e", "metadata": { "scrolled": true @@ -783,6 +777,160 @@ "WARNING:root:This output type hint will be ignored and not used for type-checking purposes. 
Typically, output type hints for a PTransform are single (or nested) types wrapped by a PCollection, PDone, or None. Got: Union[Tuple[apache_beam.pvalue.PCollection[~MLTransformOutputT], apache_beam.pvalue.PCollection[apache_beam.pvalue.Row]], apache_beam.pvalue.PCollection[~MLTransformOutputT]] instead.\n" ] }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "fb92c794ace141d6a6673d8cb5cffc54", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "modules.json: 0%| | 0.00/349 [00:00\n", - "
\n", + "
\n", "
\n", " Processing... show\n", "
\n", @@ -830,7 +978,7 @@ " }\n", " \n", " \n", - "
\n", + "
\n", "