From e9589242deed3c7f719207d59117b06cdac74bd7 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Tue, 22 Jul 2025 19:24:42 +0200 Subject: [PATCH 01/12] fix: missing embedders module --- Dockerfile | 4 ++++ dev.Dockerfile | 4 ++++ gpu-requirements.txt | 23 +++++++++++++++++++---- gpu.Dockerfile | 4 ++++ requirements.txt | 23 +++++++++++++++++++---- requirements/gpu-requirements.in | 1 + requirements/requirements.in | 1 + 7 files changed, 52 insertions(+), 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index a9c4c0f..67f0937 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,6 +4,10 @@ WORKDIR /program COPY requirements.txt . +RUN apt-get update && apt-get install -y --no-install-recommends \ + git \ + && rm -rf /var/lib/apt/lists/* + RUN pip3 install --no-cache-dir -r requirements.txt COPY / . diff --git a/dev.Dockerfile b/dev.Dockerfile index 040d52b..fdd83d3 100644 --- a/dev.Dockerfile +++ b/dev.Dockerfile @@ -6,6 +6,10 @@ VOLUME ["/app"] COPY requirements*.txt . +RUN apt-get update && apt-get install -y --no-install-recommends \ + git \ + && rm -rf /var/lib/apt/lists/* + RUN pip3 install --no-cache-dir -r requirements-dev.txt COPY / . diff --git a/gpu-requirements.txt b/gpu-requirements.txt index ec5a3cd..a2b61a7 100644 --- a/gpu-requirements.txt +++ b/gpu-requirements.txt @@ -68,7 +68,9 @@ click==8.2.1 cloudpathlib==0.21.1 # via weasel cohere==5.16.1 - # via -r requirements/gpu-requirements.in + # via + # -r requirements/gpu-requirements.in + # embedders confection==0.1.5 # via # thinc @@ -78,6 +80,8 @@ cymem==2.0.11 # preshed # spacy # thinc +embedders @ git+https://github.com/code-kern-ai/embedders.git@python-upgrade + # via -r requirements/gpu-requirements.in fastapi==0.116.1 # via -r requirements/torch-cuda-requirements.txt fastavro==1.11.1 @@ -177,6 +181,7 @@ numpy==1.23.4 # via # -r requirements/torch-cuda-requirements.txt # blis + # embedders # pandas # scikit-learn # scikit-optimize @@ -186,7 +191,9 @@ numpy==1.23.4 # torchvision # transformers openai==0.28.1 - # via -r requirements/gpu-requirements.in + # via + # -r requirements/gpu-requirements.in + # embedders packaging==25.0 # via # -r requirements/torch-cuda-requirements.txt @@ -280,6 +287,7 @@ safetensors==0.5.3 scikit-learn==1.5.2 # via # -r requirements/torch-cuda-requirements.txt + # embedders # scikit-optimize # sentence-transformers scikit-optimize==0.9.0 @@ -291,7 +299,9 @@ scipy==1.13.1 # scikit-optimize # sentence-transformers sentence-transformers==5.0.0 - # via -r requirements/gpu-requirements.in + # via + # -r requirements/gpu-requirements.in + # embedders shellingham==1.5.4 # via typer six==1.17.0 @@ -305,7 +315,9 @@ sniffio==1.3.1 # -r requirements/torch-cuda-requirements.txt # anyio spacy==3.7.5 - # via -r requirements/gpu-requirements.in + # via + # -r requirements/gpu-requirements.in + # embedders spacy-legacy==3.0.12 # via spacy spacy-loggers==1.0.5 @@ -340,6 +352,7 @@ tokenizers==0.21.2 torch==2.7.1 # via # -r requirements/torch-cuda-requirements.txt + # embedders # sentence-transformers # torchvision torchvision==0.22.1 @@ -347,6 +360,7 @@ torchvision==0.22.1 tqdm==4.67.1 # via # -r requirements/torch-cuda-requirements.txt + # embedders # huggingface-hub # openai # sentence-transformers @@ -355,6 +369,7 @@ tqdm==4.67.1 transformers==4.53.2 # via # -r requirements/torch-cuda-requirements.txt + # embedders # sentence-transformers typer==0.16.0 # via diff --git a/gpu.Dockerfile b/gpu.Dockerfile index c7a310e..f6834f9 100644 --- a/gpu.Dockerfile +++ b/gpu.Dockerfile @@ -4,6 +4,10 @@ WORKDIR /program COPY gpu-requirements.txt . +RUN apt-get update && apt-get install -y --no-install-recommends \ + git \ + && rm -rf /var/lib/apt/lists/* + RUN pip3 install --no-cache-dir -r gpu-requirements.txt COPY / . diff --git a/requirements.txt b/requirements.txt index a76401b..3826bff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -68,7 +68,9 @@ click==8.2.1 cloudpathlib==0.21.1 # via weasel cohere==5.16.1 - # via -r requirements/requirements.in + # via + # -r requirements/requirements.in + # embedders confection==0.1.5 # via # thinc @@ -78,6 +80,8 @@ cymem==2.0.11 # preshed # spacy # thinc +embedders @ git+https://github.com/code-kern-ai/embedders.git@python-upgrade + # via -r requirements/requirements.in fastapi==0.116.1 # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt fastavro==1.11.1 @@ -177,6 +181,7 @@ numpy==1.23.4 # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # blis + # embedders # pandas # scikit-learn # scikit-optimize @@ -186,7 +191,9 @@ numpy==1.23.4 # torchvision # transformers openai==0.28.1 - # via -r requirements/requirements.in + # via + # -r requirements/requirements.in + # embedders packaging==25.0 # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt @@ -280,6 +287,7 @@ safetensors==0.5.3 scikit-learn==1.5.2 # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # embedders # scikit-optimize # sentence-transformers scikit-optimize==0.9.0 @@ -291,7 +299,9 @@ scipy==1.13.1 # scikit-optimize # sentence-transformers sentence-transformers==5.0.0 - # via -r requirements/requirements.in + # via + # -r requirements/requirements.in + # embedders shellingham==1.5.4 # via typer six==1.17.0 @@ -305,7 +315,9 @@ sniffio==1.3.1 # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # anyio spacy==3.7.5 - # via -r requirements/requirements.in + # via + # -r requirements/requirements.in + # embedders spacy-legacy==3.0.12 # via spacy spacy-loggers==1.0.5 @@ -340,6 +352,7 @@ tokenizers==0.21.2 torch==2.7.1 # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # embedders # sentence-transformers # torchvision torchvision==0.22.1 @@ -347,6 +360,7 @@ torchvision==0.22.1 tqdm==4.67.1 # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # embedders # huggingface-hub # openai # sentence-transformers @@ -355,6 +369,7 @@ tqdm==4.67.1 transformers==4.53.2 # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # embedders # sentence-transformers typer==0.16.0 # via diff --git a/requirements/gpu-requirements.in b/requirements/gpu-requirements.in index 85e6908..7ccb53f 100644 --- a/requirements/gpu-requirements.in +++ b/requirements/gpu-requirements.in @@ -1,4 +1,5 @@ -r torch-cuda-requirements.txt +embedders@git+https://github.com/code-kern-ai/embedders.git@python-upgrade spacy==3.7.5 torchvision==0.22.1 # define version for torchvision to avoid dependency conflict sentence-transformers==5.0.0 # last version with default_prompt_name & pooling_mode_weightedmean_tokens # higher only possible with embedders/gates change diff --git a/requirements/requirements.in b/requirements/requirements.in index 15c7933..fc687a5 100644 --- a/requirements/requirements.in +++ b/requirements/requirements.in @@ -1,4 +1,5 @@ -r torch-cpu-requirements.txt +embedders@git+https://github.com/code-kern-ai/embedders.git@python-upgrade spacy==3.7.5 torchvision==0.22.1 # define version for torchvision to avoid dependency conflict sentence-transformers==5.0.0 # last version with default_prompt_name & pooling_mode_weightedmean_tokens # higher only possible with embedders/gates change From 865cadad2362c74a8b60ac699362b9c7291ec87f Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Fri, 25 Jul 2025 15:33:19 +0200 Subject: [PATCH 02/12] feat: embedder rework --- controller.py | 43 ++++++------- src/embedders/__init__.py | 28 ++++++++- src/embedders/classification/contextual.py | 70 ++++++++++++++------- src/embedders/classification/count_based.py | 56 +++++++++++++++-- src/embedders/classification/reduce.py | 29 +++++++++ src/embedders/util.py | 28 +++++++++ src/util/embedders.py | 10 +-- start | 1 + 8 files changed, 208 insertions(+), 57 deletions(-) diff --git a/controller.py b/controller.py index 260406e..2a449b9 100644 --- a/controller.py +++ b/controller.py @@ -4,7 +4,7 @@ from spacy.tokens import DocBin, Doc from spacy.vocab import Vocab -import pickle +import json import torch import traceback import logging @@ -14,8 +14,19 @@ import os import openai import pandas as pd +import shutil -from src.embedders import Transformer +from src.embedders import Transformer, util +from src.embedders.classification.contextual import ( + OpenAISentenceEmbedder, + HuggingFaceSentenceEmbedder, +) +from src.embedders.classification.count_based import ( + BagOfCharsSentenceEmbedder, + BagOfWordsSentenceEmbedder, + TfidfSentenceEmbedder, +) +from src.embedders.classification.reduce import PCASentenceReducer from src.util import daemon, request_util from src.util.decorator import param_throttle from src.util.embedders import get_embedder @@ -453,14 +464,7 @@ def run_encoding( request_util.post_embedding_to_neural_search(project_id, embedding_id) # now always since otherwise record edit wouldn't work for embedded columns - pickle_path = os.path.join( - "/inference", project_id, f"embedder-{embedding_id}.pkl" - ) - if not os.path.exists(pickle_path): - os.makedirs(os.path.dirname(pickle_path), exist_ok=True) - with open(pickle_path, "wb") as f: - pickle.dump(embedder, f) - + embedder.dump(project_id, embedding_id) upload_embedding_as_file(project_id, embedding_id) embedding.update_embedding_state_finished( project_id, @@ -490,9 +494,8 @@ def delete_embedding(project_id: str, embedding_id: str) -> int: org_id = organization.get_id_by_project_id(project_id) s3.delete_object(org_id, f"{project_id}/{object_name}") request_util.delete_embedding_from_neural_search(embedding_id) - pickle_path = os.path.join("/inference", project_id, f"embedder-{embedding_id}.pkl") - if os.path.exists(pickle_path): - os.remove(pickle_path) + json_path = util.INFERENCE_DIR / project_id / embedding_id / "embedder.json" + shutil.rmtree(json_path.parent) return status.HTTP_200_OK @@ -629,15 +632,13 @@ def re_embed_records(project_id: str, changes: Dict[str, List[Dict[str, str]]]): def __setup_tmp_embedder(project_id: str, embedder_id: str) -> Transformer: - embedder_path = os.path.join( - "/inference", project_id, f"embedder-{embedder_id}.pkl" - ) - if not os.path.exists(embedder_path): + embedder_path = util.INFERENCE_DIR / project_id / embedder_id / "embedder.json" + if not embedder_path.exists(): raise Exception(f"Embedder {embedder_id} not found") - with open(embedder_path, "rb") as f: - embedder = pickle.load(f) - - return embedder + with open(embedder_path, "r") as f: + embedder = json.load(f) + Embedder = eval(embedder["cls"]) + return Embedder.load(embedder) def calc_tensors(project_id: str, embedding_id: str, texts: List[str]) -> List[Any]: diff --git a/src/embedders/__init__.py b/src/embedders/__init__.py index 7375321..b0b63cc 100644 --- a/src/embedders/__init__.py +++ b/src/embedders/__init__.py @@ -50,6 +50,31 @@ def get_warnings(self) -> Dict: """ pass + @abstractmethod + def load(self, embedder: dict) -> None: + """Loads the model configuration and weights from disk. + + Args: + project_id (str): The ID of the project. + embedding_id (str): The ID of the embedding. + """ + raise NotImplementedError + + @abstractmethod + def dump(self, project_id: str, embedding_id: str) -> None: + """Dumps the model configuration and weights to disk. + + Args: + project_id (str): The ID of the project. + embedding_id (str): The ID of the embedding. + """ + raise NotImplementedError + + @abstractmethod + def to_json(self) -> dict: + """Converts the embedder to a JSON serializable dictionary.""" + raise NotImplementedError + class Embedder(Transformer, metaclass=ABCMeta): def __init__(self): @@ -112,11 +137,12 @@ def __init__( embedder: Embedder, n_components: int = 8, autocorrect_n_components: bool = True, + reducer: PCA = None, **kwargs, ): super().__init__() self.embedder = embedder - self.reducer = PCA(n_components=n_components, **kwargs) + self.reducer = reducer or PCA(n_components=n_components, **kwargs) self.batch_size = self.embedder.batch_size self.autocorrect_n_components = autocorrect_n_components diff --git a/src/embedders/classification/contextual.py b/src/embedders/classification/contextual.py index 0a46c0f..9331b07 100644 --- a/src/embedders/classification/contextual.py +++ b/src/embedders/classification/contextual.py @@ -2,11 +2,11 @@ from sentence_transformers import SentenceTransformer from src.embedders import util from src.embedders.classification import SentenceEmbedder +from src.util import request_util from spacy.tokens.doc import Doc import torch import openai from openai import error as openai_error -import cohere import time @@ -34,6 +34,25 @@ class HuggingFaceSentenceEmbedder(TransformerSentenceEmbedder): def __init__(self, config_string: str, batch_size: int = 128): super().__init__(config_string, batch_size) + @staticmethod + def load(embedder: dict) -> "HuggingFaceSentenceEmbedder": + return HuggingFaceSentenceEmbedder( + config_string=request_util.get_model_path(embedder["model_name"]), + batch_size=embedder["batch_size"], + ) + + def to_json(self) -> dict: + return { + "cls": "HuggingFaceSentenceEmbedder", + "model_name": self.model.model_card_data.base_model, + "batch_size": self.batch_size, + } + + def dump(self, project_id: str, embedding_id: str) -> None: + export_file = util.INFERENCE_DIR / project_id / embedding_id / "embedder.json" + export_file.parent.mkdir(parents=True, exist_ok=True) + util.write_json(self.to_json(), export_file, indent=2) + class OpenAISentenceEmbedder(SentenceEmbedder): def __init__( @@ -167,27 +186,30 @@ def _encode( "OpenAI API key is invalid. Please provide a valid API key in the constructor of OpenAISentenceEmbedder." ) + @staticmethod + def load(embedder: dict) -> "OpenAISentenceEmbedder": + return OpenAISentenceEmbedder( + model_name=embedder["model_name"], + batch_size=embedder["batch_size"], + openai_api_key=embedder["openai_api_key"], + api_base=embedder["api_base"], + api_type=embedder["api_type"], + api_version=embedder["api_version"], + ) -class CohereSentenceEmbedder(SentenceEmbedder): - def __init__(self, cohere_api_key: str, batch_size: int = 128): - super().__init__(batch_size) - self.cohere_api_key = cohere_api_key - self.model = cohere.Client(self.cohere_api_key) - - def __getstate__(self): - state = self.__dict__.copy() - # Don't pickle 'model' - del state["model"] - return state - - def __setstate__(self, state): - self.__dict__.update(state) - # Restore 'model' after unpickling - self.model = cohere.Client(self.cohere_api_key) - - def _encode( - self, documents: List[Union[str, Doc]], fit_model: bool - ) -> Generator[List[List[float]], None, None]: - for documents_batch in util.batch(documents, self.batch_size): - embeddings = self.model.embed(documents_batch).embeddings - yield embeddings + def to_json(self) -> dict: + return { + "cls": "OpenAISentenceEmbedder", + "model_name": self.model_name, + "batch_size": self.batch_size, + "openai_api_key": self.openai_api_key, + "api_base": self.api_base, + "api_type": self.api_type, + "api_version": self.api_version, + "use_azure": self.use_azure, + } + + def dump(self, project_id: str, embedding_id: str) -> None: + export_file = util.INFERENCE_DIR / project_id / embedding_id / "embedder.json" + export_file.parent.mkdir(parents=True, exist_ok=True) + util.write_json(self.to_json(), export_file, indent=2) diff --git a/src/embedders/classification/count_based.py b/src/embedders/classification/count_based.py index a9f53d3..e80738a 100644 --- a/src/embedders/classification/count_based.py +++ b/src/embedders/classification/count_based.py @@ -23,6 +23,32 @@ def _encode( ) yield documents_batch_embedded + @classmethod + def load(cls, embedder: dict) -> "CountSentenceEmbedder": + model = util.read_pickle(embedder["model_pkl"]) + return cls( + model=model, + batch_size=embedder["batch_size"], + min_df=embedder["min_df"], + ) + + def to_json(self) -> dict: + return { + "cls": self.__class__.__name__, + "batch_size": self.batch_size, + "min_df": self.model.min_df, + } + + def dump(self, project_id: str, embedding_id: str) -> None: + export_file = util.INFERENCE_DIR / project_id / embedding_id / "embedder.json" + export_file.parent.mkdir(parents=True, exist_ok=True) + pkl_file = util.INFERENCE_DIR / project_id / embedding_id / "python.pkl" + util.write_pickle(self.model, pkl_file) + + json_obj = self.to_json() + json_obj["model_pkl"] = str(pkl_file) + util.write_json(json_obj, export_file, indent=2) + class BagOfCharsSentenceEmbedder(CountSentenceEmbedder): """Embeds documents using plain Bag of Characters approach. @@ -32,9 +58,15 @@ class BagOfCharsSentenceEmbedder(CountSentenceEmbedder): min_df (float, optional): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float in range of [0.0, 1.0], the parameter represents a proportion of documents, integer absolute counts. Defaults to 0.1. """ - def __init__(self, batch_size: int = 128, min_df: float = 0.1, **kwargs): + def __init__( + self, + model: CountVectorizer = None, + batch_size: int = 128, + min_df: float = 0.1, + **kwargs, + ): super().__init__(batch_size, min_df) - self.model = CountVectorizer(analyzer="char", min_df=min_df, **kwargs) + self.model = model or CountVectorizer(analyzer="char", min_df=min_df, **kwargs) class BagOfWordsSentenceEmbedder(CountSentenceEmbedder): @@ -45,9 +77,15 @@ class BagOfWordsSentenceEmbedder(CountSentenceEmbedder): min_df (float, optional): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float in range of [0.0, 1.0], the parameter represents a proportion of documents, integer absolute counts. Defaults to 0.1. """ - def __init__(self, batch_size: int = 128, min_df: float = 0.1, **kwargs): + def __init__( + self, + model: CountVectorizer = None, + batch_size: int = 128, + min_df: float = 0.1, + **kwargs, + ): super().__init__(batch_size, min_df) - self.model = CountVectorizer(min_df=min_df, **kwargs) + self.model = model or CountVectorizer(min_df=min_df, **kwargs) class TfidfSentenceEmbedder(CountSentenceEmbedder): @@ -58,6 +96,12 @@ class TfidfSentenceEmbedder(CountSentenceEmbedder): min_df (float, optional): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float in range of [0.0, 1.0], the parameter represents a proportion of documents, integer absolute counts. Defaults to 0.1. """ - def __init__(self, batch_size: int = 128, min_df: float = 0.1, **kwargs): + def __init__( + self, + model: CountVectorizer = None, + batch_size: int = 128, + min_df: float = 0.1, + **kwargs, + ): super().__init__(batch_size, min_df) - self.model = TfidfVectorizer(min_df=min_df, **kwargs) + self.model = model or TfidfVectorizer(min_df=min_df, **kwargs) diff --git a/src/embedders/classification/reduce.py b/src/embedders/classification/reduce.py index 5bc226a..c3c4c2e 100644 --- a/src/embedders/classification/reduce.py +++ b/src/embedders/classification/reduce.py @@ -2,6 +2,10 @@ from typing import Union, List, Generator import numpy as np from src.embedders import PCAReducer, util +from src.embedders.classification.contextual import ( + OpenAISentenceEmbedder, + HuggingFaceSentenceEmbedder, +) class PCASentenceReducer(PCAReducer): @@ -50,3 +54,28 @@ def _reduce( else: embeddings = self.embedder.transform(documents) yield self._transform(embeddings) + + @staticmethod + def load(embedder: dict) -> "PCASentenceReducer": + reducer = util.read_pickle(embedder["reducer_pkl"]) + Embedder = eval(embedder["embedder"]["cls"]) + return PCASentenceReducer( + embedder=Embedder.load(embedder["embedder"]), + reducer=reducer, + ) + + def to_json(self) -> dict: + return { + "cls": "PCASentenceReducer", + "embedder": self.embedder.to_json(), + } + + def dump(self, project_id: str, embedding_id: str) -> None: + export_file = util.INFERENCE_DIR / project_id / embedding_id / "embedder.json" + export_file.parent.mkdir(parents=True, exist_ok=True) + pkl_file = util.INFERENCE_DIR / project_id / embedding_id / "reducer.pkl" + util.write_pickle(self.reducer, pkl_file) + + json_obj = self.to_json() + json_obj["reducer_pkl"] = str(pkl_file) + util.write_json(json_obj, export_file, indent=2) diff --git a/src/embedders/util.py b/src/embedders/util.py index 4b8c1b2..81feacf 100644 --- a/src/embedders/util.py +++ b/src/embedders/util.py @@ -1,5 +1,13 @@ from typing import Any, Generator, List +from pathlib import Path import numpy as np +import os +import json +import pickle + +INFERENCE_DIR = Path( + os.getenv("INFERENCE_DIR", "/Users/andhrelja/Projects/dev-setup/inference") +) def batch(documents: List[Any], batch_size: int) -> Generator[List[Any], None, None]: @@ -10,3 +18,23 @@ def batch(documents: List[Any], batch_size: int) -> Generator[List[Any], None, N def num_batches(documents: List[Any], batch_size: int) -> int: return int(np.ceil(len(documents) / batch_size)) + + +def read_pickle(file_path: str) -> Any: + with open(file_path, "rb") as f: + return pickle.load(f) + + +def write_pickle(obj: Any, file_path: str, **kwargs) -> None: + with open(file_path, "wb") as f: + pickle.dump(obj, f, **kwargs) + + +def read_json(file_path: str) -> Any: + with open(file_path, "r") as f: + return json.load(f) + + +def write_json(obj: Any, file_path: str, **kwargs) -> None: + with open(file_path, "w") as f: + json.dump(obj, f, **kwargs) diff --git a/src/util/embedders.py b/src/util/embedders.py index cd3d41a..abb7dd4 100644 --- a/src/util/embedders.py +++ b/src/util/embedders.py @@ -7,7 +7,7 @@ from src.embedders.classification.contextual import ( OpenAISentenceEmbedder, HuggingFaceSentenceEmbedder, - CohereSentenceEmbedder, + # CohereSentenceEmbedder, ) from src.embedders.extraction.count_based import BagOfCharsTokenEmbedder from src.embedders.extraction.contextual import TransformerTokenEmbedder @@ -58,10 +58,10 @@ def get_embedder( embedder = HuggingFaceSentenceEmbedder( config_string=model, batch_size=batch_size ) - elif platform == enums.EmbeddingPlatform.COHERE.value: - embedder = CohereSentenceEmbedder( - cohere_api_key=api_token, batch_size=batch_size - ) + # elif platform == enums.EmbeddingPlatform.COHERE.value: + # embedder = CohereSentenceEmbedder( + # cohere_api_key=api_token, batch_size=batch_size + # ) else: raise Exception(f"Unknown platform {platform}") diff --git a/start b/start index e803302..6f83e0f 100755 --- a/start +++ b/start @@ -68,6 +68,7 @@ docker run -d --rm \ -e MODEL_PROVIDER=http://refinery-model-provider:80 \ -e WS_NOTIFY_ENDPOINT="http://refinery-websocket:8080" \ -e NEURAL_SEARCH=http://refinery-neural-search:80 \ +-e INFERENCE_DIR=/inference \ --mount type=bind,source="$(pwd)"/,target=/app \ -v /var/run/docker.sock:/var/run/docker.sock \ -v "$MODEL_DIR":/models \ From 27e17b7c4495f38fe8c8e0088e388dff453fb304 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Fri, 25 Jul 2025 15:59:07 +0200 Subject: [PATCH 03/12] perf: remove cohere requirements --- requirements.txt | 158 ++++++++++++------------------- requirements/gpu-requirements.in | 4 +- requirements/requirements.in | 4 +- 3 files changed, 62 insertions(+), 104 deletions(-) diff --git a/requirements.txt b/requirements.txt index 3826bff..adf336a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,30 +14,29 @@ aiosignal==1.4.0 # via aiohttp annotated-types==0.7.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # pydantic anyio==4.9.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # httpx + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # starlette argon2-cffi==25.1.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # minio argon2-cffi-bindings==21.2.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # argon2-cffi attrs==25.3.0 # via aiohttp blis==0.7.11 # via thinc boto3==1.39.6 - # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # via -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt botocore==1.39.10 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # boto3 # s3transfer catalogue==2.0.10 @@ -47,30 +46,24 @@ catalogue==2.0.10 # thinc certifi==2025.7.14 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # httpcore - # httpx + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # minio # requests cffi==1.17.1 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # argon2-cffi-bindings charset-normalizer==3.4.2 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # requests click==8.2.1 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # typer # uvicorn cloudpathlib==0.21.1 # via weasel -cohere==5.16.1 - # via - # -r requirements/requirements.in - # embedders confection==0.1.5 # via # thinc @@ -80,15 +73,11 @@ cymem==2.0.11 # preshed # spacy # thinc -embedders @ git+https://github.com/code-kern-ai/embedders.git@python-upgrade - # via -r requirements/requirements.in fastapi==0.116.1 - # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt -fastavro==1.11.1 - # via cohere + # via -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt filelock==3.18.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # huggingface-hub # torch # transformers @@ -98,50 +87,42 @@ frozenlist==1.7.0 # aiosignal fsspec==2025.7.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # huggingface-hub # torch h11==0.16.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # httpcore + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # uvicorn hf-xet==1.1.5 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # huggingface-hub -httpcore==1.0.9 - # via httpx -httpx==0.28.1 - # via cohere -httpx-sse==0.4.0 - # via cohere huggingface-hub==0.33.4 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # sentence-transformers # tokenizers # transformers idna==3.10 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # anyio - # httpx # requests # yarl jinja2==3.1.6 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # spacy # torch jmespath==1.0.1 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # boto3 # botocore joblib==1.5.1 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # scikit-learn # scikit-optimize langcodes==3.5.0 @@ -154,15 +135,15 @@ markdown-it-py==3.0.0 # via rich markupsafe==3.0.2 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # jinja2 mdurl==0.1.2 # via markdown-it-py minio==7.2.15 - # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # via -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt mpmath==1.3.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # sympy multidict==6.6.3 # via @@ -175,13 +156,12 @@ murmurhash==1.0.13 # thinc networkx==3.5 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # torch numpy==1.23.4 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # blis - # embedders # pandas # scikit-learn # scikit-optimize @@ -191,19 +171,17 @@ numpy==1.23.4 # torchvision # transformers openai==0.28.1 - # via - # -r requirements/requirements.in - # embedders + # via -r requirements/requirements.in packaging==25.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # huggingface-hub # spacy # thinc # transformers # weasel pandas==1.5.1 - # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # via -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt pillow==11.3.0 # via # sentence-transformers @@ -217,23 +195,22 @@ propcache==0.3.2 # aiohttp # yarl psycopg2-binary==2.9.9 - # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # via -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt pyaml==25.7.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # scikit-optimize pycparser==2.22 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # cffi pycryptodome==3.23.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # minio pydantic==2.7.4 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # cohere + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # confection # fastapi # spacy @@ -241,89 +218,82 @@ pydantic==2.7.4 # weasel pydantic-core==2.18.4 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # cohere + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # pydantic pygments==2.19.2 # via rich python-dateutil==2.9.0.post0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # botocore # pandas pytz==2025.2 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # pandas pyyaml==6.0.2 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # huggingface-hub # pyaml # transformers regex==2024.11.6 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # transformers requests==2.32.4 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # cohere + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # huggingface-hub # openai # spacy # transformers # weasel -rich==14.0.0 +rich==14.1.0 # via typer s3transfer==0.13.1 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # boto3 safetensors==0.5.3 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # transformers scikit-learn==1.5.2 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # embedders + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # scikit-optimize # sentence-transformers scikit-optimize==0.9.0 - # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # via -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt scipy==1.13.1 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # scikit-learn # scikit-optimize # sentence-transformers sentence-transformers==5.0.0 - # via - # -r requirements/requirements.in - # embedders + # via -r requirements/requirements.in shellingham==1.5.4 # via typer six==1.17.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # python-dateutil smart-open==7.3.0.post1 # via weasel sniffio==1.3.1 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # anyio spacy==3.7.5 - # via - # -r requirements/requirements.in - # embedders + # via -r requirements/requirements.in spacy-legacy==3.0.12 # via spacy spacy-loggers==1.0.5 # via spacy sqlalchemy==1.4.42 - # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # via -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt srsly==2.5.1 # via # confection @@ -332,35 +302,32 @@ srsly==2.5.1 # weasel starlette==0.47.2 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # fastapi sympy==1.14.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # torch thinc==8.2.5 # via spacy threadpoolctl==3.6.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # scikit-learn tokenizers==0.21.2 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # cohere + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # transformers torch==2.7.1 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # embedders + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # sentence-transformers # torchvision torchvision==0.22.1 # via -r requirements/requirements.in tqdm==4.67.1 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # embedders + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # huggingface-hub # openai # sentence-transformers @@ -368,21 +335,17 @@ tqdm==4.67.1 # transformers transformers==4.53.2 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # embedders + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # sentence-transformers typer==0.16.0 # via # spacy # weasel -types-requests==2.32.4.20250611 - # via cohere typing-extensions==4.14.1 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # aiosignal # anyio - # cohere # fastapi # huggingface-hub # minio @@ -394,13 +357,12 @@ typing-extensions==4.14.1 # typer urllib3==2.5.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt # botocore # minio # requests - # types-requests uvicorn==0.35.0 - # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # via -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt wasabi==1.1.3 # via # spacy diff --git a/requirements/gpu-requirements.in b/requirements/gpu-requirements.in index 7ccb53f..ee09aa1 100644 --- a/requirements/gpu-requirements.in +++ b/requirements/gpu-requirements.in @@ -1,7 +1,5 @@ -r torch-cuda-requirements.txt -embedders@git+https://github.com/code-kern-ai/embedders.git@python-upgrade spacy==3.7.5 torchvision==0.22.1 # define version for torchvision to avoid dependency conflict sentence-transformers==5.0.0 # last version with default_prompt_name & pooling_mode_weightedmean_tokens # higher only possible with embedders/gates change -openai>=0.27.6,<1.0 # define version for openai to avoid dependency conflict for embedder lib -cohere==5.16.1 \ No newline at end of file +openai>=0.27.6,<1.0 # define version for openai to avoid dependency conflict for embedder lib \ No newline at end of file diff --git a/requirements/requirements.in b/requirements/requirements.in index fc687a5..42b8d96 100644 --- a/requirements/requirements.in +++ b/requirements/requirements.in @@ -1,7 +1,5 @@ -r torch-cpu-requirements.txt -embedders@git+https://github.com/code-kern-ai/embedders.git@python-upgrade spacy==3.7.5 torchvision==0.22.1 # define version for torchvision to avoid dependency conflict sentence-transformers==5.0.0 # last version with default_prompt_name & pooling_mode_weightedmean_tokens # higher only possible with embedders/gates change -openai>=0.27.6,<1.0 # define version for openai to avoid dependency conflict for embedder lib -cohere==5.16.1 \ No newline at end of file +openai>=0.27.6,<1.0 # define version for openai to avoid dependency conflict for embedder lib \ No newline at end of file From 96f6a28f97eea07d32cef1ae897e6307c8eae5ce Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Fri, 25 Jul 2025 15:59:19 +0200 Subject: [PATCH 04/12] perf: remove cohere + python embedder support --- controller.py | 1 - src/util/embedders.py | 45 +++++++------------------------------------ 2 files changed, 7 insertions(+), 39 deletions(-) diff --git a/controller.py b/controller.py index 2a449b9..d25551f 100644 --- a/controller.py +++ b/controller.py @@ -419,7 +419,6 @@ def run_encoding( elif platform == enums.EmbeddingPlatform.AZURE.value: notification_message = "Access denied due to invalid subscription key or wrong endpoint data." elif error_message == "invalid api token": - # cohere notification_message = "Access denied due to invalid api token." notification.create( project_id, diff --git a/src/util/embedders.py b/src/util/embedders.py index abb7dd4..81d1c40 100644 --- a/src/util/embedders.py +++ b/src/util/embedders.py @@ -1,15 +1,8 @@ from typing import Optional -from src.embedders.classification.count_based import ( - BagOfCharsSentenceEmbedder, - BagOfWordsSentenceEmbedder, - TfidfSentenceEmbedder, -) from src.embedders.classification.contextual import ( OpenAISentenceEmbedder, HuggingFaceSentenceEmbedder, - # CohereSentenceEmbedder, ) -from src.embedders.extraction.count_based import BagOfCharsTokenEmbedder from src.embedders.extraction.contextual import TransformerTokenEmbedder from src.embedders.classification.reduce import PCASentenceReducer from src.embedders.extraction.reduce import PCATokenReducer @@ -31,16 +24,7 @@ def get_embedder( if embedding_type == enums.EmbeddingType.ON_ATTRIBUTE.value: batch_size = 128 n_components = 64 - if platform == enums.EmbeddingPlatform.PYTHON.value: - if model == "bag-of-characters": - return BagOfCharsSentenceEmbedder(batch_size=batch_size) - elif model == "bag-of-words": - embedder = BagOfWordsSentenceEmbedder(batch_size=batch_size) - elif model == "tf-idf": - return TfidfSentenceEmbedder(batch_size=batch_size, min_df=0) - else: - raise Exception(f"Unknown model {model}") - elif ( + if ( platform == enums.EmbeddingPlatform.OPENAI.value or platform == enums.EmbeddingPlatform.AZURE.value ): @@ -58,10 +42,6 @@ def get_embedder( embedder = HuggingFaceSentenceEmbedder( config_string=model, batch_size=batch_size ) - # elif platform == enums.EmbeddingPlatform.COHERE.value: - # embedder = CohereSentenceEmbedder( - # cohere_api_key=api_token, batch_size=batch_size - # ) else: raise Exception(f"Unknown platform {platform}") @@ -74,23 +54,12 @@ def get_embedder( else: # extraction batch_size = 32 n_components = 16 - if model == "bag-of-characters": - return BagOfCharsTokenEmbedder( + return PCATokenReducer( + TransformerTokenEmbedder( + config_string=model, language_code=language_code, precomputed_docs=True, batch_size=batch_size, - ) - if model == "bag-of-words": - return None - if model == "tf-idf": - return None - else: - return PCATokenReducer( - TransformerTokenEmbedder( - config_string=model, - language_code=language_code, - precomputed_docs=True, - batch_size=batch_size, - ), - n_components=n_components, - ) + ), + n_components=n_components, + ) From e3879fa4425b97aec587c8451d707492cd2f2fa6 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Fri, 25 Jul 2025 15:59:24 +0200 Subject: [PATCH 05/12] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index c5663e9..6ebe710 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit c5663e9c855c30b431d106811a8330301c84eb24 +Subproject commit 6ebe7108363438f80fa44b39d0e6582f38e18b21 From ccbef0dbd46d17fa239dbca1e9ca7a3df14fb44f Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Mon, 28 Jul 2025 15:06:11 +0200 Subject: [PATCH 06/12] perf: support openai > 1.0.0 --- gpu-requirements.txt | 72 ++-------- requirements.txt | 156 ++++++++++----------- requirements/gpu-requirements.in | 2 +- requirements/requirements.in | 2 +- src/embedders/classification/contextual.py | 52 +++---- 5 files changed, 110 insertions(+), 174 deletions(-) diff --git a/gpu-requirements.txt b/gpu-requirements.txt index a2b61a7..a0755d5 100644 --- a/gpu-requirements.txt +++ b/gpu-requirements.txt @@ -6,12 +6,6 @@ # --extra-index-url https://download.pytorch.org/whl/cu113 -aiohappyeyeballs==2.6.1 - # via aiohttp -aiohttp==3.12.14 - # via openai -aiosignal==1.4.0 - # via aiohttp annotated-types==0.7.0 # via # -r requirements/torch-cuda-requirements.txt @@ -20,6 +14,7 @@ anyio==4.9.0 # via # -r requirements/torch-cuda-requirements.txt # httpx + # openai # starlette argon2-cffi==25.1.0 # via @@ -29,8 +24,6 @@ argon2-cffi-bindings==21.2.0 # via # -r requirements/torch-cuda-requirements.txt # argon2-cffi -attrs==25.3.0 - # via aiohttp blis==0.7.11 # via thinc boto3==1.39.6 @@ -67,10 +60,6 @@ click==8.2.1 # uvicorn cloudpathlib==0.21.1 # via weasel -cohere==5.16.1 - # via - # -r requirements/gpu-requirements.in - # embedders confection==0.1.5 # via # thinc @@ -80,22 +69,16 @@ cymem==2.0.11 # preshed # spacy # thinc -embedders @ git+https://github.com/code-kern-ai/embedders.git@python-upgrade - # via -r requirements/gpu-requirements.in +distro==1.9.0 + # via openai fastapi==0.116.1 # via -r requirements/torch-cuda-requirements.txt -fastavro==1.11.1 - # via cohere filelock==3.18.0 # via # -r requirements/torch-cuda-requirements.txt # huggingface-hub # torch # transformers -frozenlist==1.7.0 - # via - # aiohttp - # aiosignal fsspec==2025.7.0 # via # -r requirements/torch-cuda-requirements.txt @@ -113,9 +96,7 @@ hf-xet==1.1.5 httpcore==1.0.9 # via httpx httpx==0.28.1 - # via cohere -httpx-sse==0.4.0 - # via cohere + # via openai huggingface-hub==0.33.4 # via # -r requirements/torch-cuda-requirements.txt @@ -128,12 +109,13 @@ idna==3.10 # anyio # httpx # requests - # yarl jinja2==3.1.6 # via # -r requirements/torch-cuda-requirements.txt # spacy # torch +jiter==0.10.0 + # via openai jmespath==1.0.1 # via # -r requirements/torch-cuda-requirements.txt @@ -164,10 +146,6 @@ mpmath==1.3.0 # via # -r requirements/torch-cuda-requirements.txt # sympy -multidict==6.6.3 - # via - # aiohttp - # yarl murmurhash==1.0.13 # via # preshed @@ -181,7 +159,6 @@ numpy==1.23.4 # via # -r requirements/torch-cuda-requirements.txt # blis - # embedders # pandas # scikit-learn # scikit-optimize @@ -190,10 +167,8 @@ numpy==1.23.4 # thinc # torchvision # transformers -openai==0.28.1 - # via - # -r requirements/gpu-requirements.in - # embedders +openai==1.97.1 + # via -r requirements/gpu-requirements.in packaging==25.0 # via # -r requirements/torch-cuda-requirements.txt @@ -212,10 +187,6 @@ preshed==3.0.10 # via # spacy # thinc -propcache==0.3.2 - # via - # aiohttp - # yarl psycopg2-binary==2.9.9 # via -r requirements/torch-cuda-requirements.txt pyaml==25.7.0 @@ -233,16 +204,15 @@ pycryptodome==3.23.0 pydantic==2.7.4 # via # -r requirements/torch-cuda-requirements.txt - # cohere # confection # fastapi + # openai # spacy # thinc # weasel pydantic-core==2.18.4 # via # -r requirements/torch-cuda-requirements.txt - # cohere # pydantic pygments==2.19.2 # via rich @@ -268,9 +238,7 @@ regex==2024.11.6 requests==2.32.4 # via # -r requirements/torch-cuda-requirements.txt - # cohere # huggingface-hub - # openai # spacy # transformers # weasel @@ -287,7 +255,6 @@ safetensors==0.5.3 scikit-learn==1.5.2 # via # -r requirements/torch-cuda-requirements.txt - # embedders # scikit-optimize # sentence-transformers scikit-optimize==0.9.0 @@ -299,9 +266,7 @@ scipy==1.13.1 # scikit-optimize # sentence-transformers sentence-transformers==5.0.0 - # via - # -r requirements/gpu-requirements.in - # embedders + # via -r requirements/gpu-requirements.in shellingham==1.5.4 # via typer six==1.17.0 @@ -314,10 +279,9 @@ sniffio==1.3.1 # via # -r requirements/torch-cuda-requirements.txt # anyio + # openai spacy==3.7.5 - # via - # -r requirements/gpu-requirements.in - # embedders + # via -r requirements/gpu-requirements.in spacy-legacy==3.0.12 # via spacy spacy-loggers==1.0.5 @@ -347,12 +311,10 @@ threadpoolctl==3.6.0 tokenizers==0.21.2 # via # -r requirements/torch-cuda-requirements.txt - # cohere # transformers torch==2.7.1 # via # -r requirements/torch-cuda-requirements.txt - # embedders # sentence-transformers # torchvision torchvision==0.22.1 @@ -360,7 +322,6 @@ torchvision==0.22.1 tqdm==4.67.1 # via # -r requirements/torch-cuda-requirements.txt - # embedders # huggingface-hub # openai # sentence-transformers @@ -369,23 +330,19 @@ tqdm==4.67.1 transformers==4.53.2 # via # -r requirements/torch-cuda-requirements.txt - # embedders # sentence-transformers typer==0.16.0 # via # spacy # weasel -types-requests==2.32.4.20250611 - # via cohere typing-extensions==4.14.1 # via # -r requirements/torch-cuda-requirements.txt - # aiosignal # anyio - # cohere # fastapi # huggingface-hub # minio + # openai # pydantic # pydantic-core # sentence-transformers @@ -398,7 +355,6 @@ urllib3==2.5.0 # botocore # minio # requests - # types-requests uvicorn==0.35.0 # via -r requirements/torch-cuda-requirements.txt wasabi==1.1.3 @@ -410,8 +366,6 @@ weasel==0.4.1 # via spacy wrapt==1.17.2 # via smart-open -yarl==1.20.1 - # via aiohttp # The following packages are considered to be unsafe in a requirements file: # setuptools diff --git a/requirements.txt b/requirements.txt index adf336a..9bc9341 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,37 +6,31 @@ # --extra-index-url https://download.pytorch.org/whl/cpu -aiohappyeyeballs==2.6.1 - # via aiohttp -aiohttp==3.12.14 - # via openai -aiosignal==1.4.0 - # via aiohttp annotated-types==0.7.0 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # pydantic anyio==4.9.0 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt + # httpx + # openai # starlette argon2-cffi==25.1.0 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # minio argon2-cffi-bindings==21.2.0 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # argon2-cffi -attrs==25.3.0 - # via aiohttp blis==0.7.11 # via thinc boto3==1.39.6 - # via -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # via -r requirements/torch-cpu-requirements.txt botocore==1.39.10 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # boto3 # s3transfer catalogue==2.0.10 @@ -46,20 +40,22 @@ catalogue==2.0.10 # thinc certifi==2025.7.14 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt + # httpcore + # httpx # minio # requests cffi==1.17.1 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # argon2-cffi-bindings charset-normalizer==3.4.2 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # requests click==8.2.1 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # typer # uvicorn cloudpathlib==0.21.1 @@ -73,56 +69,61 @@ cymem==2.0.11 # preshed # spacy # thinc +distro==1.9.0 + # via openai fastapi==0.116.1 - # via -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # via -r requirements/torch-cpu-requirements.txt filelock==3.18.0 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # huggingface-hub # torch # transformers -frozenlist==1.7.0 - # via - # aiohttp - # aiosignal fsspec==2025.7.0 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # huggingface-hub # torch h11==0.16.0 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt + # httpcore # uvicorn hf-xet==1.1.5 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # huggingface-hub +httpcore==1.0.9 + # via httpx +httpx==0.28.1 + # via openai huggingface-hub==0.33.4 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # sentence-transformers # tokenizers # transformers idna==3.10 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # anyio + # httpx # requests - # yarl jinja2==3.1.6 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # spacy # torch +jiter==0.10.0 + # via openai jmespath==1.0.1 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # boto3 # botocore joblib==1.5.1 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # scikit-learn # scikit-optimize langcodes==3.5.0 @@ -135,20 +136,16 @@ markdown-it-py==3.0.0 # via rich markupsafe==3.0.2 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # jinja2 mdurl==0.1.2 # via markdown-it-py minio==7.2.15 - # via -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # via -r requirements/torch-cpu-requirements.txt mpmath==1.3.0 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # sympy -multidict==6.6.3 - # via - # aiohttp - # yarl murmurhash==1.0.13 # via # preshed @@ -156,11 +153,11 @@ murmurhash==1.0.13 # thinc networkx==3.5 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # torch numpy==1.23.4 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # blis # pandas # scikit-learn @@ -170,18 +167,18 @@ numpy==1.23.4 # thinc # torchvision # transformers -openai==0.28.1 +openai==1.97.1 # via -r requirements/requirements.in packaging==25.0 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # huggingface-hub # spacy # thinc # transformers # weasel pandas==1.5.1 - # via -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # via -r requirements/torch-cpu-requirements.txt pillow==11.3.0 # via # sentence-transformers @@ -190,62 +187,58 @@ preshed==3.0.10 # via # spacy # thinc -propcache==0.3.2 - # via - # aiohttp - # yarl psycopg2-binary==2.9.9 - # via -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # via -r requirements/torch-cpu-requirements.txt pyaml==25.7.0 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # scikit-optimize pycparser==2.22 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # cffi pycryptodome==3.23.0 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # minio pydantic==2.7.4 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # confection # fastapi + # openai # spacy # thinc # weasel pydantic-core==2.18.4 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # pydantic pygments==2.19.2 # via rich python-dateutil==2.9.0.post0 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # botocore # pandas pytz==2025.2 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # pandas pyyaml==6.0.2 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # huggingface-hub # pyaml # transformers regex==2024.11.6 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # transformers requests==2.32.4 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # huggingface-hub - # openai # spacy # transformers # weasel @@ -253,22 +246,22 @@ rich==14.1.0 # via typer s3transfer==0.13.1 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # boto3 safetensors==0.5.3 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # transformers scikit-learn==1.5.2 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # scikit-optimize # sentence-transformers scikit-optimize==0.9.0 - # via -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # via -r requirements/torch-cpu-requirements.txt scipy==1.13.1 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # scikit-learn # scikit-optimize # sentence-transformers @@ -278,14 +271,15 @@ shellingham==1.5.4 # via typer six==1.17.0 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # python-dateutil smart-open==7.3.0.post1 # via weasel sniffio==1.3.1 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # anyio + # openai spacy==3.7.5 # via -r requirements/requirements.in spacy-legacy==3.0.12 @@ -293,7 +287,7 @@ spacy-legacy==3.0.12 spacy-loggers==1.0.5 # via spacy sqlalchemy==1.4.42 - # via -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # via -r requirements/torch-cpu-requirements.txt srsly==2.5.1 # via # confection @@ -302,32 +296,32 @@ srsly==2.5.1 # weasel starlette==0.47.2 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # fastapi sympy==1.14.0 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # torch thinc==8.2.5 # via spacy threadpoolctl==3.6.0 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # scikit-learn tokenizers==0.21.2 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # transformers torch==2.7.1 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # sentence-transformers # torchvision torchvision==0.22.1 # via -r requirements/requirements.in tqdm==4.67.1 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # huggingface-hub # openai # sentence-transformers @@ -335,7 +329,7 @@ tqdm==4.67.1 # transformers transformers==4.53.2 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # sentence-transformers typer==0.16.0 # via @@ -343,12 +337,12 @@ typer==0.16.0 # weasel typing-extensions==4.14.1 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt - # aiosignal + # -r requirements/torch-cpu-requirements.txt # anyio # fastapi # huggingface-hub # minio + # openai # pydantic # pydantic-core # sentence-transformers @@ -357,12 +351,12 @@ typing-extensions==4.14.1 # typer urllib3==2.5.0 # via - # -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r requirements/torch-cpu-requirements.txt # botocore # minio # requests uvicorn==0.35.0 - # via -r /Users/andhrelja/Projects/refinery-embedder/requirements/torch-cpu-requirements.txt + # via -r requirements/torch-cpu-requirements.txt wasabi==1.1.3 # via # spacy @@ -372,8 +366,6 @@ weasel==0.4.1 # via spacy wrapt==1.17.2 # via smart-open -yarl==1.20.1 - # via aiohttp # The following packages are considered to be unsafe in a requirements file: # setuptools diff --git a/requirements/gpu-requirements.in b/requirements/gpu-requirements.in index ee09aa1..50ca41a 100644 --- a/requirements/gpu-requirements.in +++ b/requirements/gpu-requirements.in @@ -2,4 +2,4 @@ spacy==3.7.5 torchvision==0.22.1 # define version for torchvision to avoid dependency conflict sentence-transformers==5.0.0 # last version with default_prompt_name & pooling_mode_weightedmean_tokens # higher only possible with embedders/gates change -openai>=0.27.6,<1.0 # define version for openai to avoid dependency conflict for embedder lib \ No newline at end of file +openai==1.97.1 \ No newline at end of file diff --git a/requirements/requirements.in b/requirements/requirements.in index 42b8d96..6f337ed 100644 --- a/requirements/requirements.in +++ b/requirements/requirements.in @@ -2,4 +2,4 @@ spacy==3.7.5 torchvision==0.22.1 # define version for torchvision to avoid dependency conflict sentence-transformers==5.0.0 # last version with default_prompt_name & pooling_mode_weightedmean_tokens # higher only possible with embedders/gates change -openai>=0.27.6,<1.0 # define version for openai to avoid dependency conflict for embedder lib \ No newline at end of file +openai==1.97.1 \ No newline at end of file diff --git a/src/embedders/classification/contextual.py b/src/embedders/classification/contextual.py index 9331b07..bba8aad 100644 --- a/src/embedders/classification/contextual.py +++ b/src/embedders/classification/contextual.py @@ -5,8 +5,8 @@ from src.util import request_util from spacy.tokens.doc import Doc import torch -import openai -from openai import error as openai_error +from openai import OpenAI, AzureOpenAI +from openai import AuthenticationError, RateLimitError import time @@ -104,7 +104,6 @@ def __init__( super().__init__(batch_size) self.model_name = model_name self.openai_api_key = openai_api_key - openai.api_key = self.openai_api_key self.api_base = api_base self.api_type = api_type self.api_version = api_version @@ -123,27 +122,18 @@ def __init__( and api_base is not None ), "If you want to use Azure, you need to provide api_type, api_version and api_base." - openai.api_base = api_base - openai.api_type = api_type - openai.api_version = api_version - - def __getstate__(self): - state = self.__dict__.copy() - return state - - def __setstate__(self, state): - self.__dict__.update(state) - self.model_name = state["model_name"] - self.openai_api_key = state["openai_api_key"] - openai.api_key = self.openai_api_key - self.use_azure = state.get("use_azure") + @property + def openai_client(self): if self.use_azure: - self.api_base = state["api_base"] - self.api_type = state["api_type"] - self.api_version = state["api_version"] - openai.api_base = self.api_base - openai.api_type = self.api_type - openai.api_version = self.api_version + return AzureOpenAI( + api_key=self.openai_api_key, + azure_endpoint=self.api_base, + api_version=self.api_version, + ) + return OpenAI( + api_key=self.openai_api_key, + base_url=self.api_base, + ) def _encode( self, documents: List[Union[str, Doc]], fit_model: bool @@ -159,11 +149,11 @@ def _encode( while True and count < 60: try: count += 1 - response = openai.Embedding.create( - input=azure_batch, engine=self.model_name + response = self.openai_client.embeddings.create( + input=azure_batch, model=self.model_name ) break - except openai.error.RateLimitError as e: + except RateLimitError as e: if count >= 60: raise e if count == 1: @@ -174,14 +164,14 @@ def _encode( time.sleep(10.05) else: time.sleep(1) - embeddings += [entry["embedding"] for entry in response["data"]] + embeddings += [entry.embedding for entry in response.data] else: - response = openai.Embedding.create( - input=documents_batch, engine=self.model_name + response = self.openai_client.embeddings.create( + input=documents_batch, model=self.model_name ) - embeddings = [entry["embedding"] for entry in response["data"]] + embeddings = [entry.embedding for entry in response.data] yield embeddings - except openai_error.AuthenticationError: + except AuthenticationError: raise Exception( "OpenAI API key is invalid. Please provide a valid API key in the constructor of OpenAISentenceEmbedder." ) From 71840fe9c828d301032ad0e96ea361f394996735 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Mon, 28 Jul 2025 23:53:21 +0200 Subject: [PATCH 07/12] perf: openai alignment --- src/embedders/classification/contextual.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/embedders/classification/contextual.py b/src/embedders/classification/contextual.py index bba8aad..da66a4e 100644 --- a/src/embedders/classification/contextual.py +++ b/src/embedders/classification/contextual.py @@ -37,7 +37,7 @@ def __init__(self, config_string: str, batch_size: int = 128): @staticmethod def load(embedder: dict) -> "HuggingFaceSentenceEmbedder": return HuggingFaceSentenceEmbedder( - config_string=request_util.get_model_path(embedder["model_name"]), + config_string=request_util.get_model_path(embedder["config_string"]), batch_size=embedder["batch_size"], ) From bba274cf47cfa7d6f6c157f013bd49e46635bb9e Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Mon, 28 Jul 2025 23:53:37 +0200 Subject: [PATCH 08/12] perf: openai alignment --- controller.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/controller.py b/controller.py index d25551f..cdf3bc1 100644 --- a/controller.py +++ b/controller.py @@ -12,11 +12,13 @@ import zlib import gc import os -import openai import pandas as pd import shutil +from openai import APIConnectionError from src.embedders import Transformer, util + +# Embedder imports are used by eval(Embedder) in __setup_tmp_embedder from src.embedders.classification.contextual import ( OpenAISentenceEmbedder, HuggingFaceSentenceEmbedder, @@ -350,7 +352,7 @@ def run_encoding( enums.EmbeddingState.ENCODING.value, initial_count, ) - except openai.error.APIConnectionError as e: + except APIConnectionError as e: embedding.update_embedding_state_failed( project_id, embedding_id, From d3e42344b7f87107eeeeaae2ff36680ec97fae78 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Tue, 29 Jul 2025 17:42:52 +0200 Subject: [PATCH 09/12] perf: hf caching + openai alignment --- controller.py | 6 ++++++ src/embedders/classification/contextual.py | 14 ++++---------- src/embedders/util.py | 4 +--- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/controller.py b/controller.py index cdf3bc1..889168f 100644 --- a/controller.py +++ b/controller.py @@ -3,6 +3,7 @@ from fastapi import status from spacy.tokens import DocBin, Doc from spacy.vocab import Vocab +from functools import lru_cache import json import torch @@ -636,6 +637,11 @@ def __setup_tmp_embedder(project_id: str, embedder_id: str) -> Transformer: embedder_path = util.INFERENCE_DIR / project_id / embedder_id / "embedder.json" if not embedder_path.exists(): raise Exception(f"Embedder {embedder_id} not found") + return __load_embedder_by_path(embedder_path) + + +@lru_cache(maxsize=32) +def __load_embedder_by_path(embedder_path: str) -> Transformer: with open(embedder_path, "r") as f: embedder = json.load(f) Embedder = eval(embedder["cls"]) diff --git a/src/embedders/classification/contextual.py b/src/embedders/classification/contextual.py index da66a4e..726e8a4 100644 --- a/src/embedders/classification/contextual.py +++ b/src/embedders/classification/contextual.py @@ -44,7 +44,7 @@ def load(embedder: dict) -> "HuggingFaceSentenceEmbedder": def to_json(self) -> dict: return { "cls": "HuggingFaceSentenceEmbedder", - "model_name": self.model.model_card_data.base_model, + "config_string": self.model.model_card_data.base_model, "batch_size": self.batch_size, } @@ -121,19 +121,13 @@ def __init__( and api_version is not None and api_base is not None ), "If you want to use Azure, you need to provide api_type, api_version and api_base." - - @property - def openai_client(self): - if self.use_azure: - return AzureOpenAI( + self.openai_client = AzureOpenAI( api_key=self.openai_api_key, azure_endpoint=self.api_base, api_version=self.api_version, ) - return OpenAI( - api_key=self.openai_api_key, - base_url=self.api_base, - ) + else: + self.openai_client = OpenAI(api_key=self.openai_api_key) def _encode( self, documents: List[Union[str, Doc]], fit_model: bool diff --git a/src/embedders/util.py b/src/embedders/util.py index 81feacf..cfa6207 100644 --- a/src/embedders/util.py +++ b/src/embedders/util.py @@ -5,9 +5,7 @@ import json import pickle -INFERENCE_DIR = Path( - os.getenv("INFERENCE_DIR", "/Users/andhrelja/Projects/dev-setup/inference") -) +INFERENCE_DIR = Path(os.getenv("INFERENCE_DIR", "/inference")) def batch(documents: List[Any], batch_size: int) -> Generator[List[Any], None, None]: From 5524dc32ddd5bd4f1fd9e10a7e4669c5e3eed6a1 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Wed, 30 Jul 2025 09:42:36 +0200 Subject: [PATCH 10/12] feat: remove count-based embedders --- Dockerfile | 4 - controller.py | 7 -- dev.Dockerfile | 4 - gpu.Dockerfile | 4 - src/embedders/classification/count_based.py | 107 -------------------- src/embedders/extraction/count_based.py | 47 --------- 6 files changed, 173 deletions(-) delete mode 100644 src/embedders/classification/count_based.py delete mode 100644 src/embedders/extraction/count_based.py diff --git a/Dockerfile b/Dockerfile index 67f0937..a9c4c0f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,10 +4,6 @@ WORKDIR /program COPY requirements.txt . -RUN apt-get update && apt-get install -y --no-install-recommends \ - git \ - && rm -rf /var/lib/apt/lists/* - RUN pip3 install --no-cache-dir -r requirements.txt COPY / . diff --git a/controller.py b/controller.py index 889168f..6be1953 100644 --- a/controller.py +++ b/controller.py @@ -24,11 +24,6 @@ OpenAISentenceEmbedder, HuggingFaceSentenceEmbedder, ) -from src.embedders.classification.count_based import ( - BagOfCharsSentenceEmbedder, - BagOfWordsSentenceEmbedder, - TfidfSentenceEmbedder, -) from src.embedders.classification.reduce import PCASentenceReducer from src.util import daemon, request_util from src.util.decorator import param_throttle @@ -421,8 +416,6 @@ def run_encoding( notification_message = "Access denied due to invalid api key." elif platform == enums.EmbeddingPlatform.AZURE.value: notification_message = "Access denied due to invalid subscription key or wrong endpoint data." - elif error_message == "invalid api token": - notification_message = "Access denied due to invalid api token." notification.create( project_id, user_id, diff --git a/dev.Dockerfile b/dev.Dockerfile index fdd83d3..040d52b 100644 --- a/dev.Dockerfile +++ b/dev.Dockerfile @@ -6,10 +6,6 @@ VOLUME ["/app"] COPY requirements*.txt . -RUN apt-get update && apt-get install -y --no-install-recommends \ - git \ - && rm -rf /var/lib/apt/lists/* - RUN pip3 install --no-cache-dir -r requirements-dev.txt COPY / . diff --git a/gpu.Dockerfile b/gpu.Dockerfile index f6834f9..c7a310e 100644 --- a/gpu.Dockerfile +++ b/gpu.Dockerfile @@ -4,10 +4,6 @@ WORKDIR /program COPY gpu-requirements.txt . -RUN apt-get update && apt-get install -y --no-install-recommends \ - git \ - && rm -rf /var/lib/apt/lists/* - RUN pip3 install --no-cache-dir -r gpu-requirements.txt COPY / . diff --git a/src/embedders/classification/count_based.py b/src/embedders/classification/count_based.py deleted file mode 100644 index e80738a..0000000 --- a/src/embedders/classification/count_based.py +++ /dev/null @@ -1,107 +0,0 @@ -from typing import List, Union, Generator -from sklearn.feature_extraction.text import CountVectorizer -from sklearn.feature_extraction.text import TfidfVectorizer -from src.embedders.classification import SentenceEmbedder -from src.embedders import util - - -class CountSentenceEmbedder(SentenceEmbedder): - def __init__(self, batch_size: int, min_df: float, **kwargs): - super().__init__(batch_size) - - def _encode( - self, documents: List[str], fit_model: bool - ) -> Generator[List[List[Union[float, int]]], None, None]: - if fit_model: - self.model.fit(documents) - - for documents_batch in util.batch(documents, self.batch_size): - documents_batch_embedded = [] - for doc in documents_batch: - documents_batch_embedded.append( - self.model.transform([doc]).toarray().tolist()[0] - ) - yield documents_batch_embedded - - @classmethod - def load(cls, embedder: dict) -> "CountSentenceEmbedder": - model = util.read_pickle(embedder["model_pkl"]) - return cls( - model=model, - batch_size=embedder["batch_size"], - min_df=embedder["min_df"], - ) - - def to_json(self) -> dict: - return { - "cls": self.__class__.__name__, - "batch_size": self.batch_size, - "min_df": self.model.min_df, - } - - def dump(self, project_id: str, embedding_id: str) -> None: - export_file = util.INFERENCE_DIR / project_id / embedding_id / "embedder.json" - export_file.parent.mkdir(parents=True, exist_ok=True) - pkl_file = util.INFERENCE_DIR / project_id / embedding_id / "python.pkl" - util.write_pickle(self.model, pkl_file) - - json_obj = self.to_json() - json_obj["model_pkl"] = str(pkl_file) - util.write_json(json_obj, export_file, indent=2) - - -class BagOfCharsSentenceEmbedder(CountSentenceEmbedder): - """Embeds documents using plain Bag of Characters approach. - - Args: - batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. - min_df (float, optional): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float in range of [0.0, 1.0], the parameter represents a proportion of documents, integer absolute counts. Defaults to 0.1. - """ - - def __init__( - self, - model: CountVectorizer = None, - batch_size: int = 128, - min_df: float = 0.1, - **kwargs, - ): - super().__init__(batch_size, min_df) - self.model = model or CountVectorizer(analyzer="char", min_df=min_df, **kwargs) - - -class BagOfWordsSentenceEmbedder(CountSentenceEmbedder): - """Embeds documents using plain Bag of Words approach. - - Args: - batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. - min_df (float, optional): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float in range of [0.0, 1.0], the parameter represents a proportion of documents, integer absolute counts. Defaults to 0.1. - """ - - def __init__( - self, - model: CountVectorizer = None, - batch_size: int = 128, - min_df: float = 0.1, - **kwargs, - ): - super().__init__(batch_size, min_df) - self.model = model or CountVectorizer(min_df=min_df, **kwargs) - - -class TfidfSentenceEmbedder(CountSentenceEmbedder): - """Embeds documents using Term Frequency - Inverse Document Frequency (TFIDF) approach. - - Args: - batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. - min_df (float, optional): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float in range of [0.0, 1.0], the parameter represents a proportion of documents, integer absolute counts. Defaults to 0.1. - """ - - def __init__( - self, - model: CountVectorizer = None, - batch_size: int = 128, - min_df: float = 0.1, - **kwargs, - ): - super().__init__(batch_size, min_df) - self.model = model or TfidfVectorizer(min_df=min_df, **kwargs) diff --git a/src/embedders/extraction/count_based.py b/src/embedders/extraction/count_based.py deleted file mode 100644 index 7c45900..0000000 --- a/src/embedders/extraction/count_based.py +++ /dev/null @@ -1,47 +0,0 @@ -from typing import List, Generator, Union -from sklearn.feature_extraction.text import CountVectorizer -from src.embedders import util -from spacy.tokens.doc import Doc - -from src.embedders.extraction import TokenEmbedder - - -class BagOfCharsTokenEmbedder(TokenEmbedder): - """Embeds documents using plain Bag of Characters approach. - - Args: - language_code (str): Name of the spaCy language model - precomputed_docs (bool, optional): If you have a large text corpus, it might make sense to precompute the data and input tokenized spaCy documents. Defaults to False. - batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. - """ - - def __init__( - self, - language_code: str, - precomputed_docs: bool = False, - batch_size: int = 128, - **kwargs - ): - super().__init__(language_code, precomputed_docs, batch_size) - self.model = CountVectorizer(analyzer="char", min_df=0.01, **kwargs) - - def _encode( - self, documents: List[Union[str, Doc]], fit_model: bool - ) -> Generator[List[List[List[int]]], None, None]: - if fit_model: - if self.preloaded: - self.model.fit([doc.text for doc in documents]) - else: - self.model.fit(documents) - - for documents_batch in util.batch(documents, self.batch_size): - documents_batch_embedded = [] - for doc in documents_batch: - documents_batch_embedded.append( - self.model.transform( - [tok.text for tok in self._get_tokenized_document(doc)] - ) - .toarray() - .tolist() - ) - yield documents_batch_embedded From 514ee8c302e908d6c2d860388cf6f65a382a6c43 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Wed, 30 Jul 2025 15:54:03 +0200 Subject: [PATCH 11/12] perf: update dump/load paths perf: use pickle.dumps --- controller.py | 6 +++--- src/embedders/__init__.py | 3 +-- src/embedders/classification/contextual.py | 6 ++++-- src/embedders/classification/reduce.py | 19 +++++++++++-------- src/embedders/util.py | 5 ++--- start | 1 - 6 files changed, 21 insertions(+), 19 deletions(-) diff --git a/controller.py b/controller.py index 6be1953..464faf1 100644 --- a/controller.py +++ b/controller.py @@ -489,8 +489,8 @@ def delete_embedding(project_id: str, embedding_id: str) -> int: org_id = organization.get_id_by_project_id(project_id) s3.delete_object(org_id, f"{project_id}/{object_name}") request_util.delete_embedding_from_neural_search(embedding_id) - json_path = util.INFERENCE_DIR / project_id / embedding_id / "embedder.json" - shutil.rmtree(json_path.parent) + json_path = util.INFERENCE_DIR / project_id / f"embedder-{embedding_id}.json" + json_path.unlink(missing_ok=True) return status.HTTP_200_OK @@ -627,7 +627,7 @@ def re_embed_records(project_id: str, changes: Dict[str, List[Dict[str, str]]]): def __setup_tmp_embedder(project_id: str, embedder_id: str) -> Transformer: - embedder_path = util.INFERENCE_DIR / project_id / embedder_id / "embedder.json" + embedder_path = util.INFERENCE_DIR / project_id / f"embedder-{embedder_id}.json" if not embedder_path.exists(): raise Exception(f"Embedder {embedder_id} not found") return __load_embedder_by_path(embedder_path) diff --git a/src/embedders/__init__.py b/src/embedders/__init__.py index b0b63cc..e5e2532 100644 --- a/src/embedders/__init__.py +++ b/src/embedders/__init__.py @@ -55,8 +55,7 @@ def load(self, embedder: dict) -> None: """Loads the model configuration and weights from disk. Args: - project_id (str): The ID of the project. - embedding_id (str): The ID of the embedding. + embedder (dict): The dumped model configuration. """ raise NotImplementedError diff --git a/src/embedders/classification/contextual.py b/src/embedders/classification/contextual.py index 726e8a4..b32acd9 100644 --- a/src/embedders/classification/contextual.py +++ b/src/embedders/classification/contextual.py @@ -49,7 +49,7 @@ def to_json(self) -> dict: } def dump(self, project_id: str, embedding_id: str) -> None: - export_file = util.INFERENCE_DIR / project_id / embedding_id / "embedder.json" + export_file = util.INFERENCE_DIR / project_id / f"embedder-{embedding_id}.json" export_file.parent.mkdir(parents=True, exist_ok=True) util.write_json(self.to_json(), export_file, indent=2) @@ -176,6 +176,7 @@ def load(embedder: dict) -> "OpenAISentenceEmbedder": model_name=embedder["model_name"], batch_size=embedder["batch_size"], openai_api_key=embedder["openai_api_key"], + # only set for Azure api_base=embedder["api_base"], api_type=embedder["api_type"], api_version=embedder["api_version"], @@ -187,6 +188,7 @@ def to_json(self) -> dict: "model_name": self.model_name, "batch_size": self.batch_size, "openai_api_key": self.openai_api_key, + # only set for Azure "api_base": self.api_base, "api_type": self.api_type, "api_version": self.api_version, @@ -194,6 +196,6 @@ def to_json(self) -> dict: } def dump(self, project_id: str, embedding_id: str) -> None: - export_file = util.INFERENCE_DIR / project_id / embedding_id / "embedder.json" + export_file = util.INFERENCE_DIR / project_id / f"embedder-{embedding_id}.json" export_file.parent.mkdir(parents=True, exist_ok=True) util.write_json(self.to_json(), export_file, indent=2) diff --git a/src/embedders/classification/reduce.py b/src/embedders/classification/reduce.py index c3c4c2e..c44265c 100644 --- a/src/embedders/classification/reduce.py +++ b/src/embedders/classification/reduce.py @@ -1,7 +1,10 @@ from spacy.tokens.doc import Doc from typing import Union, List, Generator import numpy as np +import pickle from src.embedders import PCAReducer, util + +# Embedder imports are used by eval(Embedder) in load methods from src.embedders.classification.contextual import ( OpenAISentenceEmbedder, HuggingFaceSentenceEmbedder, @@ -57,7 +60,9 @@ def _reduce( @staticmethod def load(embedder: dict) -> "PCASentenceReducer": - reducer = util.read_pickle(embedder["reducer_pkl"]) + reducer = pickle.loads( + embedder["reducer_pkl_bytes"].encode("latin-1") + ) # Decode to latin1 to avoid binary issues in JSON Embedder = eval(embedder["embedder"]["cls"]) return PCASentenceReducer( embedder=Embedder.load(embedder["embedder"]), @@ -68,14 +73,12 @@ def to_json(self) -> dict: return { "cls": "PCASentenceReducer", "embedder": self.embedder.to_json(), + "reducer_pkl_bytes": pickle.dumps(self.reducer).decode( + "latin-1" + ), # Encode to latin1 to avoid binary issues in JSON } def dump(self, project_id: str, embedding_id: str) -> None: - export_file = util.INFERENCE_DIR / project_id / embedding_id / "embedder.json" + export_file = util.INFERENCE_DIR / project_id / f"embedder-{embedding_id}.json" export_file.parent.mkdir(parents=True, exist_ok=True) - pkl_file = util.INFERENCE_DIR / project_id / embedding_id / "reducer.pkl" - util.write_pickle(self.reducer, pkl_file) - - json_obj = self.to_json() - json_obj["reducer_pkl"] = str(pkl_file) - util.write_json(json_obj, export_file, indent=2) + util.write_json(self.to_json(), export_file, indent=2) diff --git a/src/embedders/util.py b/src/embedders/util.py index cfa6207..2ddecb5 100644 --- a/src/embedders/util.py +++ b/src/embedders/util.py @@ -1,11 +1,10 @@ from typing import Any, Generator, List from pathlib import Path import numpy as np -import os import json import pickle -INFERENCE_DIR = Path(os.getenv("INFERENCE_DIR", "/inference")) +INFERENCE_DIR = Path("/inference") def batch(documents: List[Any], batch_size: int) -> Generator[List[Any], None, None]: @@ -28,7 +27,7 @@ def write_pickle(obj: Any, file_path: str, **kwargs) -> None: pickle.dump(obj, f, **kwargs) -def read_json(file_path: str) -> Any: +def read_json(file_path: str) -> dict[str, Any]: with open(file_path, "r") as f: return json.load(f) diff --git a/start b/start index 6f83e0f..e803302 100755 --- a/start +++ b/start @@ -68,7 +68,6 @@ docker run -d --rm \ -e MODEL_PROVIDER=http://refinery-model-provider:80 \ -e WS_NOTIFY_ENDPOINT="http://refinery-websocket:8080" \ -e NEURAL_SEARCH=http://refinery-neural-search:80 \ --e INFERENCE_DIR=/inference \ --mount type=bind,source="$(pwd)"/,target=/app \ -v /var/run/docker.sock:/var/run/docker.sock \ -v "$MODEL_DIR":/models \ From 63899452c6b06b902dc78c85ed42d97b99aba664 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Wed, 30 Jul 2025 15:55:30 +0200 Subject: [PATCH 12/12] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index 6ebe710..b41145a 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 6ebe7108363438f80fa44b39d0e6582f38e18b21 +Subproject commit b41145ac4d0284b68c65b88baff034123f5403a5