From 339754a4b8aa6b65cfcb563e11a80026fc9a15f4 Mon Sep 17 00:00:00 2001 From: sumanth-fw Date: Mon, 22 Dec 2025 17:31:12 +0530 Subject: [PATCH 1/3] feat(oss-opensearch): Introduce on-disk vector storage configuration. --- vectordb_bench/backend/clients/oss_opensearch/cli.py | 10 ++++++++++ .../backend/clients/oss_opensearch/config.py | 3 +++ .../backend/clients/oss_opensearch/oss_opensearch.py | 3 +++ vectordb_bench/frontend/config/dbCaseConfigs.py | 10 ++++++++++ vectordb_bench/models.py | 1 + 5 files changed, 27 insertions(+) diff --git a/vectordb_bench/backend/clients/oss_opensearch/cli.py b/vectordb_bench/backend/clients/oss_opensearch/cli.py index 804a4bc82..51828bbe9 100644 --- a/vectordb_bench/backend/clients/oss_opensearch/cli.py +++ b/vectordb_bench/backend/clients/oss_opensearch/cli.py @@ -118,6 +118,16 @@ class OSSOpenSearchTypedDict(TypedDict): ), ] + on_disk: Annotated[ + bool, + click.option( + "--on-disk", + is_flag=True, + help="Enable on-disk vector storage mode", + default=False, + ), + ] + class OSSOpenSearchHNSWTypedDict(CommonTypedDict, OSSOpenSearchTypedDict, HNSWFlavor1): ... 
diff --git a/vectordb_bench/backend/clients/oss_opensearch/config.py b/vectordb_bench/backend/clients/oss_opensearch/config.py index 3f961bf09..343c29380 100644 --- a/vectordb_bench/backend/clients/oss_opensearch/config.py +++ b/vectordb_bench/backend/clients/oss_opensearch/config.py @@ -71,6 +71,7 @@ class OSSOpenSearchIndexConfig(BaseModel, DBCaseConfig): force_merge_enabled: bool | None = True flush_threshold_size: str | None = "5120mb" index_thread_qty_during_force_merge: int = 8 + on_disk: bool = False cb_threshold: str | None = "50%" number_of_indexing_clients: int | None = 1 use_routing: bool = False # for label-filter cases @@ -107,6 +108,7 @@ def __eq__(self, obj: any): and self.replication_type == obj.replication_type and self.knn_derived_source_enabled == obj.knn_derived_source_enabled and self.memory_optimized_search == obj.memory_optimized_search + and self.on_disk == obj.on_disk ) def __hash__(self) -> int: @@ -123,6 +125,7 @@ def __hash__(self) -> int: self.replication_type, self.knn_derived_source_enabled, self.memory_optimized_search, + self.on_disk, ) ) diff --git a/vectordb_bench/backend/clients/oss_opensearch/oss_opensearch.py b/vectordb_bench/backend/clients/oss_opensearch/oss_opensearch.py index a790834de..3ea680339 100644 --- a/vectordb_bench/backend/clients/oss_opensearch/oss_opensearch.py +++ b/vectordb_bench/backend/clients/oss_opensearch/oss_opensearch.py @@ -330,6 +330,9 @@ def _create_index(self, client: OpenSearch) -> None: "method": self.case_config.index_param(), } + if self.case_config.on_disk: + properties[self.vector_col_name]["mode"] = "on_disk" + mappings = { "properties": properties, } diff --git a/vectordb_bench/frontend/config/dbCaseConfigs.py b/vectordb_bench/frontend/config/dbCaseConfigs.py index 9348c243e..15188dddd 100644 --- a/vectordb_bench/frontend/config/dbCaseConfigs.py +++ b/vectordb_bench/frontend/config/dbCaseConfigs.py @@ -1793,6 +1793,14 @@ class CaseConfigInput(BaseModel): }, ) 
+CaseConfigParamInput_ON_DISK_AWSOpensearch = CaseConfigInput( + label=CaseConfigParamType.on_disk, + displayLabel="On Disk", + inputHelp="Enable on-disk vector storage mode (The on_disk mode only works with the float data type.)", + inputType=InputType.Bool, + inputConfig={"value": False}, +) + CaseConfigParamInput_NUMBER_OF_INDEXING_CLIENTS_AWSOpensearch = CaseConfigInput( label=CaseConfigParamType.number_of_indexing_clients, displayLabel="Number of Indexing Clients", @@ -2337,6 +2345,7 @@ class CaseConfigInput(BaseModel): CaseConfigParamInput_REPLICATION_TYPE_AWSOpensearch, CaseConfigParamInput_MEMORY_OPTIMIZED_SEARCH_AWSOpensearch, CaseConfigParamInput_INDEX_THREAD_QTY_DURING_FORCE_MERGE_AWSOpensearch, + CaseConfigParamInput_ON_DISK_AWSOpensearch, ] AWSOpenSearchPerformanceConfig = [ @@ -2354,6 +2363,7 @@ class CaseConfigInput(BaseModel): CaseConfigParamInput_REPLICATION_TYPE_AWSOpensearch, CaseConfigParamInput_MEMORY_OPTIMIZED_SEARCH_AWSOpensearch, CaseConfigParamInput_INDEX_THREAD_QTY_DURING_FORCE_MERGE_AWSOpensearch, + CaseConfigParamInput_ON_DISK_AWSOpensearch, ] # Map DB to config diff --git a/vectordb_bench/models.py b/vectordb_bench/models.py index cce0fa116..d67754187 100644 --- a/vectordb_bench/models.py +++ b/vectordb_bench/models.py @@ -113,6 +113,7 @@ class CaseConfigParamType(Enum): num_sub_vectors = "num_sub_vectors" sample_rate = "sample_rate" index_thread_qty_during_force_merge = "index_thread_qty_during_force_merge" + on_disk = "on_disk" number_of_indexing_clients = "number_of_indexing_clients" number_of_shards = "number_of_shards" number_of_replicas = "number_of_replicas" From 972d8d64f53a2aa686265712cb4b48ea224afea9 Mon Sep 17 00:00:00 2001 From: sumanth-fw Date: Mon, 22 Dec 2025 17:52:44 +0530 Subject: [PATCH 2/3] feat(oss-opensearch): Enhance on-disk vector storage option with engine-specific display logic and improved help text. 
--- vectordb_bench/backend/clients/oss_opensearch/cli.py | 4 +++- vectordb_bench/frontend/config/dbCaseConfigs.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/vectordb_bench/backend/clients/oss_opensearch/cli.py b/vectordb_bench/backend/clients/oss_opensearch/cli.py index 51828bbe9..ee21a62fb 100644 --- a/vectordb_bench/backend/clients/oss_opensearch/cli.py +++ b/vectordb_bench/backend/clients/oss_opensearch/cli.py @@ -123,8 +123,9 @@ class OSSOpenSearchTypedDict(TypedDict): click.option( "--on-disk", is_flag=True, - help="Enable on-disk vector storage mode", + help="Enable on-disk vector storage mode only for faiss engine (The on_disk mode only works with the float data type.)", default=False, + required=False, ), ] @@ -160,6 +161,7 @@ def OSSOpenSearch(**parameters: Unpack[OSSOpenSearchHNSWTypedDict]): M=parameters["m"], engine=OSSOS_Engine(parameters["engine"]), quantization_type=OSSOpenSearchQuantization(parameters["quantization_type"]), + on_disk=parameters["on_disk"], ), **parameters, ) diff --git a/vectordb_bench/frontend/config/dbCaseConfigs.py b/vectordb_bench/frontend/config/dbCaseConfigs.py index 15188dddd..3dd0a2e9a 100644 --- a/vectordb_bench/frontend/config/dbCaseConfigs.py +++ b/vectordb_bench/frontend/config/dbCaseConfigs.py @@ -1799,6 +1799,7 @@ class CaseConfigInput(BaseModel): inputHelp="Enable on-disk vector storage mode (The on_disk mode only works with the float data type.)", inputType=InputType.Bool, inputConfig={"value": False}, + isDisplayed=lambda config: (config.get(CaseConfigParamType.engine_name, "").lower() == "faiss"), ) CaseConfigParamInput_NUMBER_OF_INDEXING_CLIENTS_AWSOpensearch = CaseConfigInput( From cd3079c7067e6c6d53e25beb5d0099a51bb94592 Mon Sep 17 00:00:00 2001 From: sumanth-fw Date: Mon, 22 Dec 2025 23:02:56 +0530 Subject: [PATCH 3/3] feat(oss-opensearch): Implement version-specific properties handling for on-disk storage mode support in OpenSearch. 
--- .../clients/oss_opensearch/oss_opensearch.py | 35 +++++++++++++++++-- .../frontend/config/dbCaseConfigs.py | 2 +- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/vectordb_bench/backend/clients/oss_opensearch/oss_opensearch.py b/vectordb_bench/backend/clients/oss_opensearch/oss_opensearch.py index 3ea680339..8949b8f98 100644 --- a/vectordb_bench/backend/clients/oss_opensearch/oss_opensearch.py +++ b/vectordb_bench/backend/clients/oss_opensearch/oss_opensearch.py @@ -43,6 +43,17 @@ }, ] +VERSION_SPECIFIC_PROPERTIES_RULES = [ + { + "name": "mode", + "applies": lambda version, case_config: ( + version >= Version("2.17") + and case_config.engine == OSSOS_Engine.faiss + ), + "value": lambda case_config: "on_disk" if case_config.on_disk else "in_memory", + }, +] + class OpenSearchError(Exception): """Custom exception for OpenSearch operations.""" @@ -274,6 +285,18 @@ def _get_version_specific_settings(self, cluster_version: Version) -> dict: value = setting["value"](self.case_config) version_specific_settings[name] = value return version_specific_settings + + def _get_version_specific_properties(self, cluster_version: Version) -> dict: + """ + Return the VERSION_SPECIFIC_PROPERTIES_RULES entries that apply, as a name -> value dict. 
+ """ + version_specific_properties = {} + for rule in VERSION_SPECIFIC_PROPERTIES_RULES: + if rule["applies"](cluster_version, self.case_config): + name = rule["name"] + value = rule["value"](self.case_config) + version_specific_properties[name] = value + return version_specific_properties def _get_bulk_manager(self, client: OpenSearch) -> BulkInsertManager: """Get bulk insert manager for the given client.""" @@ -291,6 +314,8 @@ def _create_index(self, client: OpenSearch) -> None: log.info(f"All case_config parameters: {self.case_config.__dict__}") settings_manager = self._get_settings_manager(client) + cluster_version = self._get_cluster_version(client) + cluster_settings = { "knn.algo_param.index_thread_qty": self.case_config.index_thread_qty, "knn.memory.circuit_breaker.limit": self.case_config.cb_threshold, @@ -311,13 +336,14 @@ def _create_index(self, client: OpenSearch) -> None: } settings["index"]["knn.algo_param.ef_search"] = ef_search_value - version_specific_settings = self._get_version_specific_settings(self._get_cluster_version(client)) + version_specific_settings = self._get_version_specific_settings(cluster_version) if version_specific_settings: log.info(f"Applying version-dependent settings: {version_specific_settings}") settings["index"].update(version_specific_settings) # Build properties mapping, excluding _id which is automatically handled by OpenSearch properties = {} + version_specific_properties = self._get_version_specific_properties(cluster_version) # Only add id field to properties if it's not the special _id field if self.id_col_name != "_id": @@ -330,8 +356,11 @@ def _create_index(self, client: OpenSearch) -> None: "method": self.case_config.index_param(), } - if self.case_config.on_disk: - properties[self.vector_col_name]["mode"] = "on_disk" + # Apply "mode" only when a rule supplied it; warn if on_disk was requested but is unsupported + if "mode" in version_specific_properties: + properties[self.vector_col_name]["mode"] = version_specific_properties["mode"] + elif self.case_config.on_disk: + log.warning("on_disk ignored: requires faiss engine and OpenSearch >= 2.17") mappings
= { "properties": properties, diff --git a/vectordb_bench/frontend/config/dbCaseConfigs.py b/vectordb_bench/frontend/config/dbCaseConfigs.py index 3dd0a2e9a..4b2a17618 100644 --- a/vectordb_bench/frontend/config/dbCaseConfigs.py +++ b/vectordb_bench/frontend/config/dbCaseConfigs.py @@ -1796,7 +1796,7 @@ class CaseConfigInput(BaseModel): CaseConfigParamInput_ON_DISK_AWSOpensearch = CaseConfigInput( label=CaseConfigParamType.on_disk, displayLabel="On Disk", - inputHelp="Enable on-disk vector storage mode (The on_disk mode only works with the float data type.)", + inputHelp="Enable on-disk vector storage mode (The on_disk mode only works with the float data type.) Supported by OpenSearch >=2.17", inputType=InputType.Bool, inputConfig={"value": False}, isDisplayed=lambda config: (config.get(CaseConfigParamType.engine_name, "").lower() == "faiss"),