From cb68ff0955b4624b5cb2f55c7e01b20ad62a1cc6 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Fri, 25 Jul 2025 15:39:32 +0200 Subject: [PATCH 1/6] feat: embedder rework --- controller/record/manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/controller/record/manager.py b/controller/record/manager.py index e55aa269..ca618744 100644 --- a/controller/record/manager.py +++ b/controller/record/manager.py @@ -270,7 +270,7 @@ def __check_and_prep_edit_records( continue emb_path = os.path.join( - "/inference", project_id, f"embedder-{str(embedding_item.id)}.pkl" + "/inference", project_id, str(embedding_item.id), "reducer.pkl" ) if not os.path.exists(emb_path): errors_found.append( From 5a6dd288bc170c76d5c02f174a9532231ffafbe1 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Fri, 25 Jul 2025 15:54:26 +0200 Subject: [PATCH 2/6] perf: deprecate Cohere and Python embedders --- controller/embedding/manager.py | 1 - controller/embedding/terms.py | 10 ---------- controller/transfer/project_transfer_manager.py | 5 +---- 3 files changed, 1 insertion(+), 15 deletions(-) diff --git a/controller/embedding/manager.py b/controller/embedding/manager.py index f15cf777..ae6de486 100644 --- a/controller/embedding/manager.py +++ b/controller/embedding/manager.py @@ -230,7 +230,6 @@ def __recreate_or_extend_embedding(project_id: str, embedding_id: str) -> Embedd if ( new_embedding_item.platform == enums.EmbeddingPlatform.OPENAI.value - or new_embedding_item.platform == enums.EmbeddingPlatform.COHERE.value or new_embedding_item.platform == enums.EmbeddingPlatform.AZURE.value ): agreement_item = agreement.get_by_xfkey( diff --git a/controller/embedding/terms.py b/controller/embedding/terms.py index c5085404..98217bcc 100644 --- a/controller/embedding/terms.py +++ b/controller/embedding/terms.py @@ -11,16 +11,6 @@ "terms": "Please note that by enabling this third-party API, you are stating that you accept its addition as a sub-processor under the terms of our Data Processing Agreement. Please be aware that the OpenAI API policies may conflict with your internal data and privacy policies. For more information please check: @@PLACEHOLDER@@. For questions you can contact us at security@kern.ai.", "link": "https://openai.com/policies/api-data-usage-policies", }, - EmbeddingPlatform.COHERE.value:{ - "platform": EmbeddingPlatform.COHERE.value, - "terms": "Please note that by enabling this third-party API, you are stating that you accept its addition as a sub-processor under the terms of our Data Processing Agreement. Please be aware that the Cohere API policies may conflict with your internal data and privacy policies. For more information please check: @@PLACEHOLDER@@. For questions you can contact us at security@kern.ai.", - "link": "https://cohere.com/terms-of-use", - }, - EmbeddingPlatform.PYTHON.value: { - "platform": EmbeddingPlatform.PYTHON.value, - "terms": None, - "link": None, - }, EmbeddingPlatform.AZURE.value: { "platform": EmbeddingPlatform.AZURE.value, "terms": "Please note that by enabling this third-party API, you are stating that you accept its addition as a sub-processor under the terms of our Data Processing Agreement. Please be aware that the Azure API policies may conflict with your internal data and privacy policies. For more information please check: @@PLACEHOLDER@@. For questions you can contact us at security@kern.ai.", diff --git a/controller/transfer/project_transfer_manager.py b/controller/transfer/project_transfer_manager.py index 4f638eb3..97556c60 100644 --- a/controller/transfer/project_transfer_manager.py +++ b/controller/transfer/project_transfer_manager.py @@ -339,10 +339,7 @@ def __transform_embedding_by_name(embedding_name: str): attribute_name = splitted_name[0] embedding_type = splitted_name[1] model = "-".join(splitted_name[2:]) - if "bag-of-words" == model or "bag-of-characters" == model or "tf-idf" == model: - platform = enums.EmbeddingPlatform.PYTHON.value - else: - platform = enums.EmbeddingPlatform.HUGGINGFACE.value + platform = enums.EmbeddingPlatform.HUGGINGFACE.value name = f"{attribute_name}-{embedding_type}-{platform}-{model}" return platform, model, name From a4d43279d0dff79fb0bd0210392d194a3a0d8180 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Fri, 25 Jul 2025 15:54:42 +0200 Subject: [PATCH 3/6] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index 1198fda6..6ebe7108 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 1198fda6b1f475aaea1c9dfc0fde7177238fb30c +Subproject commit 6ebe7108363438f80fa44b39d0e6582f38e18b21 From 7789d2bfbbb669936010457ff04974ad832bbead Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Wed, 30 Jul 2025 09:45:21 +0200 Subject: [PATCH 4/6] fix: search for embedders.json instead of reducer.pkl --- controller/record/manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/controller/record/manager.py b/controller/record/manager.py index ca618744..26af98f5 100644 --- a/controller/record/manager.py +++ b/controller/record/manager.py @@ -270,7 +270,7 @@ def __check_and_prep_edit_records( continue emb_path = os.path.join( - "/inference", project_id, str(embedding_item.id), "reducer.pkl" + "/inference", project_id, str(embedding_item.id), "embedder.json" ) if not os.path.exists(emb_path): errors_found.append( From bf798bacaa20d2258d2e77ad086827a98d928b9f Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Wed, 30 Jul 2025 15:54:20 +0200 Subject: [PATCH 5/6] perf: update dump/load paths --- controller/record/manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/controller/record/manager.py b/controller/record/manager.py index 26af98f5..59262d41 100644 --- a/controller/record/manager.py +++ b/controller/record/manager.py @@ -270,7 +270,7 @@ def __check_and_prep_edit_records( continue emb_path = os.path.join( - "/inference", project_id, str(embedding_item.id), "embedder.json" + "/inference", project_id, f"embedder-{str(embedding_item.id)}.json" ) if not os.path.exists(emb_path): errors_found.append( From f2a3780e47ca43af4a493b413764e3d6ea66f586 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Wed, 30 Jul 2025 15:55:52 +0200 Subject: [PATCH 6/6] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index 6ebe7108..b41145ac 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 6ebe7108363438f80fa44b39d0e6582f38e18b21 +Subproject commit b41145ac4d0284b68c65b88baff034123f5403a5