diff --git a/controller.py b/controller.py
index 260406e..464faf1 100644
--- a/controller.py
+++ b/controller.py
@@ -3,8 +3,9 @@
 from fastapi import status
 from spacy.tokens import DocBin, Doc
 from spacy.vocab import Vocab
+from functools import lru_cache
 
-import pickle
+import json
 import torch
 import traceback
 import logging
@@ -12,10 +13,18 @@
 import zlib
 import gc
 import os
-import openai
 import pandas as pd
+import shutil
+from openai import APIConnectionError
 
-from src.embedders import Transformer
+from src.embedders import Transformer, util
+
+# Embedder imports are used by eval(Embedder) in __setup_tmp_embedder
+from src.embedders.classification.contextual import (
+    OpenAISentenceEmbedder,
+    HuggingFaceSentenceEmbedder,
+)
+from src.embedders.classification.reduce import PCASentenceReducer
 from src.util import daemon, request_util
 from src.util.decorator import param_throttle
 from src.util.embedders import get_embedder
@@ -339,7 +348,7 @@ def run_encoding(
             enums.EmbeddingState.ENCODING.value,
             initial_count,
         )
-    except openai.error.APIConnectionError as e:
+    except APIConnectionError as e:
         embedding.update_embedding_state_failed(
             project_id,
             embedding_id,
@@ -407,9 +416,6 @@ def run_encoding(
             notification_message = "Access denied due to invalid api key."
         elif platform == enums.EmbeddingPlatform.AZURE.value:
             notification_message = "Access denied due to invalid subscription key or wrong endpoint data."
-        elif error_message == "invalid api token":
-            # cohere
-            notification_message = "Access denied due to invalid api token."
         notification.create(
             project_id,
             user_id,
@@ -453,14 +459,7 @@ def run_encoding(
         request_util.post_embedding_to_neural_search(project_id, embedding_id)
 
         # now always since otherwise record edit wouldn't work for embedded columns
-        pickle_path = os.path.join(
-            "/inference", project_id, f"embedder-{embedding_id}.pkl"
-        )
-        if not os.path.exists(pickle_path):
-            os.makedirs(os.path.dirname(pickle_path), exist_ok=True)
-        with open(pickle_path, "wb") as f:
-            pickle.dump(embedder, f)
-
+        embedder.dump(project_id, embedding_id)
         upload_embedding_as_file(project_id, embedding_id)
         embedding.update_embedding_state_finished(
             project_id,
@@ -490,9 +489,8 @@ def delete_embedding(project_id: str, embedding_id: str) -> int:
     org_id = organization.get_id_by_project_id(project_id)
     s3.delete_object(org_id, f"{project_id}/{object_name}")
     request_util.delete_embedding_from_neural_search(embedding_id)
-    pickle_path = os.path.join("/inference", project_id, f"embedder-{embedding_id}.pkl")
-    if os.path.exists(pickle_path):
-        os.remove(pickle_path)
+    json_path = util.INFERENCE_DIR / project_id / f"embedder-{embedding_id}.json"
+    json_path.unlink(missing_ok=True)
     return status.HTTP_200_OK
 
 
@@ -629,15 +627,18 @@ def re_embed_records(project_id: str, changes: Dict[str, List[Dict[str, str]]]):
 
 def __setup_tmp_embedder(project_id: str, embedder_id: str) -> Transformer:
-    embedder_path = os.path.join(
-        "/inference", project_id, f"embedder-{embedder_id}.pkl"
-    )
-    if not os.path.exists(embedder_path):
+    embedder_path = util.INFERENCE_DIR / project_id / f"embedder-{embedder_id}.json"
+    if not embedder_path.exists():
         raise Exception(f"Embedder {embedder_id} not found")
-    with open(embedder_path, "rb") as f:
-        embedder = pickle.load(f)
+    return __load_embedder_by_path(embedder_path)
+
 
-    return embedder
+@lru_cache(maxsize=32)
+def __load_embedder_by_path(embedder_path: str) -> Transformer:
+    with open(embedder_path, "r") as f:
+        embedder = json.load(f)
+    Embedder = eval(embedder["cls"])
+    return Embedder.load(embedder)
 
 
 def calc_tensors(project_id: str, embedding_id: str, texts: List[str]) -> List[Any]:
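Note on the controller changes above: embedders are now persisted as JSON (`embedder.dump(...)`) instead of pickled, and `__load_embedder_by_path` memoizes loaded embedders with `functools.lru_cache`, resolving the class named in the JSON's `"cls"` field via `eval` — which is why the embedder classes must be imported into the module namespace. A minimal, self-contained sketch of the same round-trip, using an explicit name→class registry in place of `eval` and a hypothetical `ToyEmbedder`:

```python
# Illustrative sketch only — ToyEmbedder stands in for the real embedder
# classes, and an explicit registry replaces the eval() used in the patch.
import json
from functools import lru_cache
from pathlib import Path


class ToyEmbedder:
    def __init__(self, batch_size: int = 128):
        self.batch_size = batch_size

    def to_json(self) -> dict:
        return {"cls": "ToyEmbedder", "batch_size": self.batch_size}

    @staticmethod
    def load(config: dict) -> "ToyEmbedder":
        return ToyEmbedder(batch_size=config["batch_size"])


REGISTRY = {"ToyEmbedder": ToyEmbedder}  # name -> class lookup


@lru_cache(maxsize=32)  # Path objects are hashable, so they work as cache keys
def load_embedder(path: Path) -> ToyEmbedder:
    config = json.loads(path.read_text())
    return REGISTRY[config["cls"]].load(config)
```

Because the cache keys on the path, a re-dumped embedder at an unchanged path is not picked up until its cache entry is evicted — worth keeping in mind wherever embeddings are re-created.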
diff --git a/gpu-requirements.txt b/gpu-requirements.txt
index ec5a3cd..a0755d5 100644
--- a/gpu-requirements.txt
+++ b/gpu-requirements.txt
@@ -6,12 +6,6 @@
 #
 --extra-index-url https://download.pytorch.org/whl/cu113
 
-aiohappyeyeballs==2.6.1
-    # via aiohttp
-aiohttp==3.12.14
-    # via openai
-aiosignal==1.4.0
-    # via aiohttp
 annotated-types==0.7.0
     # via
     #   -r requirements/torch-cuda-requirements.txt
     #   pydantic
@@ -20,6 +14,7 @@ anyio==4.9.0
     # via
     #   -r requirements/torch-cuda-requirements.txt
     #   httpx
+    #   openai
     #   starlette
 argon2-cffi==25.1.0
     # via
@@ -29,8 +24,6 @@ argon2-cffi-bindings==21.2.0
     # via
     #   -r requirements/torch-cuda-requirements.txt
     #   argon2-cffi
-attrs==25.3.0
-    # via aiohttp
 blis==0.7.11
     # via thinc
 boto3==1.39.6
@@ -67,8 +60,6 @@ click==8.2.1
     # via
     #   uvicorn
 cloudpathlib==0.21.1
     # via weasel
-cohere==5.16.1
-    # via -r requirements/gpu-requirements.in
 confection==0.1.5
     # via
     #   thinc
@@ -78,20 +69,16 @@ cymem==2.0.11
     #   preshed
     #   spacy
     #   thinc
+distro==1.9.0
+    # via openai
 fastapi==0.116.1
     # via -r requirements/torch-cuda-requirements.txt
-fastavro==1.11.1
-    # via cohere
 filelock==3.18.0
     # via
     #   -r requirements/torch-cuda-requirements.txt
     #   huggingface-hub
     #   torch
     #   transformers
-frozenlist==1.7.0
-    # via
-    #   aiohttp
-    #   aiosignal
 fsspec==2025.7.0
     # via
     #   -r requirements/torch-cuda-requirements.txt
     #   huggingface-hub
     #   torch
@@ -109,9 +96,7 @@ hf-xet==1.1.5
 httpcore==1.0.9
     # via httpx
 httpx==0.28.1
-    # via cohere
-httpx-sse==0.4.0
-    # via cohere
+    # via openai
 huggingface-hub==0.33.4
     # via
     #   -r requirements/torch-cuda-requirements.txt
@@ -124,12 +109,13 @@ idna==3.10
     #   anyio
     #   httpx
     #   requests
-    #   yarl
 jinja2==3.1.6
     # via
     #   -r requirements/torch-cuda-requirements.txt
     #   spacy
     #   torch
+jiter==0.10.0
+    # via openai
 jmespath==1.0.1
     # via
     #   -r requirements/torch-cuda-requirements.txt
@@ -160,10 +146,6 @@ mpmath==1.3.0
     # via
     #   -r requirements/torch-cuda-requirements.txt
     #   sympy
-multidict==6.6.3
-    # via
-    #   aiohttp
-    #   yarl
 murmurhash==1.0.13
     # via
     #   preshed
@@ -185,7 +167,7 @@ numpy==1.23.4
     #   thinc
     #   torchvision
     #   transformers
-openai==0.28.1
+openai==1.97.1
     # via -r requirements/gpu-requirements.in
 packaging==25.0
     # via
@@ -205,10 +187,6 @@ preshed==3.0.10
     # via
     #   spacy
     #   thinc
-propcache==0.3.2
-    # via
-    #   aiohttp
-    #   yarl
 psycopg2-binary==2.9.9
     # via -r requirements/torch-cuda-requirements.txt
 pyaml==25.7.0
@@ -226,16 +204,15 @@ pycryptodome==3.23.0
 pydantic==2.7.4
     # via
     #   -r requirements/torch-cuda-requirements.txt
-    #   cohere
     #   confection
     #   fastapi
+    #   openai
     #   spacy
     #   thinc
     #   weasel
 pydantic-core==2.18.4
     # via
     #   -r requirements/torch-cuda-requirements.txt
-    #   cohere
     #   pydantic
 pygments==2.19.2
     # via rich
@@ -261,9 +238,7 @@ regex==2024.11.6
 requests==2.32.4
     # via
     #   -r requirements/torch-cuda-requirements.txt
-    #   cohere
     #   huggingface-hub
-    #   openai
     #   spacy
     #   transformers
     #   weasel
@@ -304,6 +279,7 @@ sniffio==1.3.1
     # via
     #   -r requirements/torch-cuda-requirements.txt
     #   anyio
+    #   openai
 spacy==3.7.5
     # via -r requirements/gpu-requirements.in
 spacy-legacy==3.0.12
@@ -335,7 +311,6 @@ threadpoolctl==3.6.0
 tokenizers==0.21.2
     # via
     #   -r requirements/torch-cuda-requirements.txt
-    #   cohere
     #   transformers
 torch==2.7.1
     # via
@@ -360,17 +335,14 @@ typer==0.16.0
     # via
     #   spacy
     #   weasel
-types-requests==2.32.4.20250611
-    # via cohere
 typing-extensions==4.14.1
     # via
     #   -r requirements/torch-cuda-requirements.txt
-    #   aiosignal
     #   anyio
-    #   cohere
     #   fastapi
     #   huggingface-hub
     #   minio
+    #   openai
     #   pydantic
     #   pydantic-core
     #   sentence-transformers
@@ -383,7 +355,6 @@ urllib3==2.5.0
     #   botocore
     #   minio
     #   requests
-    #   types-requests
 uvicorn==0.35.0
     # via -r requirements/torch-cuda-requirements.txt
 wasabi==1.1.3
     # via
@@ -395,8 +366,6 @@ weasel==0.4.1
     # via spacy
 wrapt==1.17.2
     # via smart-open
-yarl==1.20.1
-    # via aiohttp
 
 # The following packages are considered to be unsafe in a requirements file:
 # setuptools
diff --git a/requirements.txt b/requirements.txt
index a76401b..9bc9341 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,38 +6,31 @@
 #
 --extra-index-url https://download.pytorch.org/whl/cpu
 
-aiohappyeyeballs==2.6.1
-    # via aiohttp
-aiohttp==3.12.14
-    # via openai
-aiosignal==1.4.0
-    # via aiohttp
 annotated-types==0.7.0
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   pydantic
 anyio==4.9.0
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   httpx
+    #   openai
     #   starlette
 argon2-cffi==25.1.0
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   minio
 argon2-cffi-bindings==21.2.0
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   argon2-cffi
-attrs==25.3.0
-    # via aiohttp
 blis==0.7.11
     # via thinc
 boto3==1.39.6
-    # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    # via -r requirements/torch-cpu-requirements.txt
 botocore==1.39.10
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   boto3
     #   s3transfer
 catalogue==2.0.10
@@ -47,28 +40,26 @@ catalogue==2.0.10
     #   thinc
 certifi==2025.7.14
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   httpcore
     #   httpx
     #   minio
     #   requests
 cffi==1.17.1
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   argon2-cffi-bindings
 charset-normalizer==3.4.2
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   requests
 click==8.2.1
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   typer
     #   uvicorn
 cloudpathlib==0.21.1
     # via weasel
-cohere==5.16.1
-    # via -r requirements/requirements.in
 confection==0.1.5
     # via
     #   thinc
@@ -78,66 +69,61 @@ cymem==2.0.11
     #   preshed
     #   spacy
     #   thinc
+distro==1.9.0
+    # via openai
 fastapi==0.116.1
-    # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
-fastavro==1.11.1
-    # via cohere
+    # via -r requirements/torch-cpu-requirements.txt
 filelock==3.18.0
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   huggingface-hub
     #   torch
     #   transformers
-frozenlist==1.7.0
-    # via
-    #   aiohttp
-    #   aiosignal
 fsspec==2025.7.0
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   huggingface-hub
     #   torch
 h11==0.16.0
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   httpcore
     #   uvicorn
 hf-xet==1.1.5
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   huggingface-hub
 httpcore==1.0.9
     # via httpx
 httpx==0.28.1
-    # via cohere
-httpx-sse==0.4.0
-    # via cohere
+    # via openai
 huggingface-hub==0.33.4
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   sentence-transformers
     #   tokenizers
     #   transformers
 idna==3.10
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   anyio
     #   httpx
     #   requests
-    #   yarl
 jinja2==3.1.6
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   spacy
     #   torch
+jiter==0.10.0
+    # via openai
 jmespath==1.0.1
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   boto3
     #   botocore
 joblib==1.5.1
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   scikit-learn
     #   scikit-optimize
 langcodes==3.5.0
@@ -150,20 +136,16 @@ markdown-it-py==3.0.0
     # via rich
 markupsafe==3.0.2
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   jinja2
 mdurl==0.1.2
     # via markdown-it-py
 minio==7.2.15
-    # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    # via -r requirements/torch-cpu-requirements.txt
 mpmath==1.3.0
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   sympy
-multidict==6.6.3
-    # via
-    #   aiohttp
-    #   yarl
 murmurhash==1.0.13
     # via
     #   preshed
@@ -171,11 +153,11 @@ murmurhash==1.0.13
     #   thinc
 networkx==3.5
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   torch
 numpy==1.23.4
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   blis
     #   pandas
     #   scikit-learn
@@ -185,18 +167,18 @@ numpy==1.23.4
     #   thinc
     #   torchvision
     #   transformers
-openai==0.28.1
+openai==1.97.1
     # via -r requirements/requirements.in
 packaging==25.0
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   huggingface-hub
     #   spacy
     #   thinc
     #   transformers
     #   weasel
 pandas==1.5.1
-    # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    # via -r requirements/torch-cpu-requirements.txt
 pillow==11.3.0
     # via
     #   sentence-transformers
@@ -205,88 +187,81 @@ preshed==3.0.10
     # via
     #   spacy
     #   thinc
-propcache==0.3.2
-    # via
-    #   aiohttp
-    #   yarl
 psycopg2-binary==2.9.9
-    # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    # via -r requirements/torch-cpu-requirements.txt
 pyaml==25.7.0
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   scikit-optimize
 pycparser==2.22
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   cffi
 pycryptodome==3.23.0
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   minio
 pydantic==2.7.4
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
-    #   cohere
+    #   -r requirements/torch-cpu-requirements.txt
     #   confection
     #   fastapi
+    #   openai
     #   spacy
     #   thinc
     #   weasel
 pydantic-core==2.18.4
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
-    #   cohere
+    #   -r requirements/torch-cpu-requirements.txt
     #   pydantic
 pygments==2.19.2
     # via rich
 python-dateutil==2.9.0.post0
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   botocore
     #   pandas
 pytz==2025.2
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   pandas
 pyyaml==6.0.2
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   huggingface-hub
     #   pyaml
     #   transformers
 regex==2024.11.6
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   transformers
 requests==2.32.4
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
-    #   cohere
+    #   -r requirements/torch-cpu-requirements.txt
     #   huggingface-hub
-    #   openai
     #   spacy
     #   transformers
     #   weasel
-rich==14.0.0
+rich==14.1.0
     # via typer
 s3transfer==0.13.1
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   boto3
 safetensors==0.5.3
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   transformers
 scikit-learn==1.5.2
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   scikit-optimize
     #   sentence-transformers
 scikit-optimize==0.9.0
-    # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    # via -r requirements/torch-cpu-requirements.txt
 scipy==1.13.1
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   scikit-learn
     #   scikit-optimize
     #   sentence-transformers
@@ -296,14 +271,15 @@ shellingham==1.5.4
     # via typer
 six==1.17.0
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   python-dateutil
 smart-open==7.3.0.post1
     # via weasel
 sniffio==1.3.1
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   anyio
+    #   openai
 spacy==3.7.5
     # via -r requirements/requirements.in
 spacy-legacy==3.0.12
@@ -311,7 +287,7 @@ spacy-legacy==3.0.12
 spacy-loggers==1.0.5
     # via spacy
 sqlalchemy==1.4.42
-    # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    # via -r requirements/torch-cpu-requirements.txt
 srsly==2.5.1
     # via
     #   confection
@@ -320,33 +296,32 @@ srsly==2.5.1
     #   weasel
 starlette==0.47.2
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   fastapi
 sympy==1.14.0
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   torch
 thinc==8.2.5
     # via spacy
 threadpoolctl==3.6.0
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   scikit-learn
 tokenizers==0.21.2
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
-    #   cohere
+    #   -r requirements/torch-cpu-requirements.txt
     #   transformers
 torch==2.7.1
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   sentence-transformers
     #   torchvision
 torchvision==0.22.1
     # via -r requirements/requirements.in
 tqdm==4.67.1
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   huggingface-hub
     #   openai
     #   sentence-transformers
@@ -354,23 +329,20 @@ tqdm==4.67.1
     #   transformers
 transformers==4.53.2
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   sentence-transformers
 typer==0.16.0
     # via
     #   spacy
     #   weasel
-types-requests==2.32.4.20250611
-    # via cohere
 typing-extensions==4.14.1
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
-    #   aiosignal
+    #   -r requirements/torch-cpu-requirements.txt
     #   anyio
-    #   cohere
     #   fastapi
     #   huggingface-hub
     #   minio
+    #   openai
     #   pydantic
     #   pydantic-core
     #   sentence-transformers
@@ -379,13 +351,12 @@ typing-extensions==4.14.1
     #   typer
 urllib3==2.5.0
     # via
-    #   -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    #   -r requirements/torch-cpu-requirements.txt
     #   botocore
     #   minio
     #   requests
-    #   types-requests
 uvicorn==0.35.0
-    # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt
+    # via -r requirements/torch-cpu-requirements.txt
 wasabi==1.1.3
     # via
     #   spacy
@@ -395,8 +366,6 @@ weasel==0.4.1
     # via spacy
 wrapt==1.17.2
     # via smart-open
-yarl==1.20.1
-    # via aiohttp
 
 # The following packages are considered to be unsafe in a requirements file:
 # setuptools
diff --git a/requirements/gpu-requirements.in b/requirements/gpu-requirements.in
index 85e6908..50ca41a 100644
--- a/requirements/gpu-requirements.in
+++ b/requirements/gpu-requirements.in
@@ -2,5 +2,4 @@
 spacy==3.7.5
 torchvision==0.22.1 # define version for torchvision to avoid dependency conflict
 sentence-transformers==5.0.0 # last version with default_prompt_name & pooling_mode_weightedmean_tokens # higher only possible with embedders/gates change
-openai>=0.27.6,<1.0 # define version for openai to avoid dependency conflict for embedder lib
-cohere==5.16.1
\ No newline at end of file
+openai==1.97.1
\ No newline at end of file
diff --git a/requirements/requirements.in b/requirements/requirements.in
index 15c7933..6f337ed 100644
--- a/requirements/requirements.in
+++ b/requirements/requirements.in
@@ -2,5 +2,4 @@
 spacy==3.7.5
 torchvision==0.22.1 # define version for torchvision to avoid dependency conflict
 sentence-transformers==5.0.0 # last version with default_prompt_name & pooling_mode_weightedmean_tokens # higher only possible with embedders/gates change
-openai>=0.27.6,<1.0 # define version for openai to avoid dependency conflict for embedder lib
-cohere==5.16.1
\ No newline at end of file
+openai==1.97.1
\ No newline at end of file
diff --git a/src/embedders/__init__.py b/src/embedders/__init__.py
index 7375321..e5e2532 100644
--- a/src/embedders/__init__.py
+++ b/src/embedders/__init__.py
@@ -50,6 +50,30 @@ def get_warnings(self) -> Dict:
         """
         pass
 
+    @abstractmethod
+    def load(self, embedder: dict) -> "Transformer":
+        """Restores the embedder from a dumped model configuration.
+
+        Args:
+            embedder (dict): The dumped model configuration.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def dump(self, project_id: str, embedding_id: str) -> None:
+        """Dumps the model configuration and weights to disk.
+
+        Args:
+            project_id (str): The ID of the project.
+            embedding_id (str): The ID of the embedding.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def to_json(self) -> dict:
+        """Converts the embedder to a JSON serializable dictionary."""
+        raise NotImplementedError
+
 
 class Embedder(Transformer, metaclass=ABCMeta):
     def __init__(self):
@@ -112,11 +136,12 @@ def __init__(
         embedder: Embedder,
         n_components: int = 8,
         autocorrect_n_components: bool = True,
+        reducer: PCA = None,
         **kwargs,
     ):
         super().__init__()
         self.embedder = embedder
-        self.reducer = PCA(n_components=n_components, **kwargs)
+        self.reducer = reducer or PCA(n_components=n_components, **kwargs)
         self.batch_size = self.embedder.batch_size
         self.autocorrect_n_components = autocorrect_n_components
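Note on the `PCAReducer` change above: the new `reducer` keyword exists so that a load path can hand back an already-fitted PCA instead of fitting a fresh one (`self.reducer = reducer or PCA(...)`). A fitted scikit-learn PCA carries its learned components through serialization, which is what makes the reconstruction equivalent — a minimal sketch, scikit-learn assumed:

```python
# A fitted PCA round-trips with its learned state intact; this mirrors what
# PCASentenceReducer.dump/load do via the JSON-embedded pickle payload.
import pickle

import numpy as np
from sklearn.decomposition import PCA

X = np.random.rand(16, 8)
fitted = PCA(n_components=2).fit(X)

restored = pickle.loads(pickle.dumps(fitted))  # stands in for dump -> load
assert np.allclose(restored.transform(X), fitted.transform(X))
```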
diff --git a/src/embedders/classification/contextual.py b/src/embedders/classification/contextual.py
index 0a46c0f..b32acd9 100644
--- a/src/embedders/classification/contextual.py
+++ b/src/embedders/classification/contextual.py
@@ -2,11 +2,11 @@
 from sentence_transformers import SentenceTransformer
 from src.embedders import util
 from src.embedders.classification import SentenceEmbedder
+from src.util import request_util
 from spacy.tokens.doc import Doc
 import torch
-import openai
-from openai import error as openai_error
-import cohere
+from openai import OpenAI, AzureOpenAI
+from openai import AuthenticationError, RateLimitError
 import time
 
@@ -34,6 +34,25 @@ class HuggingFaceSentenceEmbedder(TransformerSentenceEmbedder):
     def __init__(self, config_string: str, batch_size: int = 128):
         super().__init__(config_string, batch_size)
 
+    @staticmethod
+    def load(embedder: dict) -> "HuggingFaceSentenceEmbedder":
+        return HuggingFaceSentenceEmbedder(
+            config_string=request_util.get_model_path(embedder["config_string"]),
+            batch_size=embedder["batch_size"],
+        )
+
+    def to_json(self) -> dict:
+        return {
+            "cls": "HuggingFaceSentenceEmbedder",
+            "config_string": self.model.model_card_data.base_model,
+            "batch_size": self.batch_size,
+        }
+
+    def dump(self, project_id: str, embedding_id: str) -> None:
+        export_file = util.INFERENCE_DIR / project_id / f"embedder-{embedding_id}.json"
+        export_file.parent.mkdir(parents=True, exist_ok=True)
+        util.write_json(self.to_json(), export_file, indent=2)
+
 
 class OpenAISentenceEmbedder(SentenceEmbedder):
     def __init__(
@@ -85,7 +104,6 @@ def __init__(
         super().__init__(batch_size)
         self.model_name = model_name
         self.openai_api_key = openai_api_key
-        openai.api_key = self.openai_api_key
         self.api_base = api_base
         self.api_type = api_type
         self.api_version = api_version
@@ -103,28 +121,13 @@ def __init__(
                 and api_version is not None
                 and api_base is not None
             ), "If you want to use Azure, you need to provide api_type, api_version and api_base."
-
-            openai.api_base = api_base
-            openai.api_type = api_type
-            openai.api_version = api_version
-
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        return state
-
-    def __setstate__(self, state):
-        self.__dict__.update(state)
-        self.model_name = state["model_name"]
-        self.openai_api_key = state["openai_api_key"]
-        openai.api_key = self.openai_api_key
-        self.use_azure = state.get("use_azure")
-        if self.use_azure:
-            self.api_base = state["api_base"]
-            self.api_type = state["api_type"]
-            self.api_version = state["api_version"]
-            openai.api_base = self.api_base
-            openai.api_type = self.api_type
-            openai.api_version = self.api_version
+            self.openai_client = AzureOpenAI(
+                api_key=self.openai_api_key,
+                azure_endpoint=self.api_base,
+                api_version=self.api_version,
+            )
+        else:
+            self.openai_client = OpenAI(api_key=self.openai_api_key)
 
     def _encode(
         self, documents: List[Union[str, Doc]], fit_model: bool
@@ -140,11 +143,11 @@ def _encode(
                 while True and count < 60:
                     try:
                         count += 1
-                        response = openai.Embedding.create(
-                            input=azure_batch, engine=self.model_name
+                        response = self.openai_client.embeddings.create(
+                            input=azure_batch, model=self.model_name
                         )
                         break
-                    except openai.error.RateLimitError as e:
+                    except RateLimitError as e:
                         if count >= 60:
                             raise e
                         if count == 1:
@@ -155,39 +158,44 @@ def _encode(
                             time.sleep(10.05)
                         else:
                             time.sleep(1)
-                embeddings += [entry["embedding"] for entry in response["data"]]
+                embeddings += [entry.embedding for entry in response.data]
             else:
-                response = openai.Embedding.create(
-                    input=documents_batch, engine=self.model_name
+                response = self.openai_client.embeddings.create(
+                    input=documents_batch, model=self.model_name
                 )
-                embeddings = [entry["embedding"] for entry in response["data"]]
+                embeddings = [entry.embedding for entry in response.data]
             yield embeddings
-        except openai_error.AuthenticationError:
+        except AuthenticationError:
             raise Exception(
                 "OpenAI API key is invalid. Please provide a valid API key in the constructor of OpenAISentenceEmbedder."
             )
 
+    @staticmethod
+    def load(embedder: dict) -> "OpenAISentenceEmbedder":
+        return OpenAISentenceEmbedder(
+            model_name=embedder["model_name"],
+            batch_size=embedder["batch_size"],
+            openai_api_key=embedder["openai_api_key"],
+            # only set for Azure
+            api_base=embedder["api_base"],
+            api_type=embedder["api_type"],
+            api_version=embedder["api_version"],
+        )
 
-class CohereSentenceEmbedder(SentenceEmbedder):
-    def __init__(self, cohere_api_key: str, batch_size: int = 128):
-        super().__init__(batch_size)
-        self.cohere_api_key = cohere_api_key
-        self.model = cohere.Client(self.cohere_api_key)
-
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        # Don't pickle 'model'
-        del state["model"]
-        return state
-
-    def __setstate__(self, state):
-        self.__dict__.update(state)
-        # Restore 'model' after unpickling
-        self.model = cohere.Client(self.cohere_api_key)
-
-    def _encode(
-        self, documents: List[Union[str, Doc]], fit_model: bool
-    ) -> Generator[List[List[float]], None, None]:
-        for documents_batch in util.batch(documents, self.batch_size):
-            embeddings = self.model.embed(documents_batch).embeddings
-            yield embeddings
+    def to_json(self) -> dict:
+        return {
+            "cls": "OpenAISentenceEmbedder",
+            "model_name": self.model_name,
+            "batch_size": self.batch_size,
+            "openai_api_key": self.openai_api_key,
+            # only set for Azure
+            "api_base": self.api_base,
+            "api_type": self.api_type,
+            "api_version": self.api_version,
+            "use_azure": self.use_azure,
+        }
+
+    def dump(self, project_id: str, embedding_id: str) -> None:
+        export_file = util.INFERENCE_DIR / project_id / f"embedder-{embedding_id}.json"
+        export_file.parent.mkdir(parents=True, exist_ok=True)
+        util.write_json(self.to_json(), export_file, indent=2)
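Note on the `contextual.py` changes above: they track the openai-python 1.x API, where module-level configuration (`openai.api_key`, `openai.api_base`, ...) is replaced by explicit `OpenAI`/`AzureOpenAI` client objects, exceptions are imported from the package root, and responses are typed objects rather than dicts. A minimal sketch of the new call shape, with a placeholder key and example model name:

```python
from openai import OpenAI, RateLimitError

client = OpenAI(api_key="sk-...")  # placeholder key
try:
    response = client.embeddings.create(
        model="text-embedding-ada-002",  # example model name
        input=["some text to embed"],
    )
    vector = response.data[0].embedding  # attribute access, not ["data"]
except RateLimitError:
    pass  # the embedder above retries with sleeps instead of giving up
```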
- """ - - def __init__(self, batch_size: int = 128, min_df: float = 0.1, **kwargs): - super().__init__(batch_size, min_df) - self.model = CountVectorizer(analyzer="char", min_df=min_df, **kwargs) - - -class BagOfWordsSentenceEmbedder(CountSentenceEmbedder): - """Embeds documents using plain Bag of Words approach. - - Args: - batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. - min_df (float, optional): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float in range of [0.0, 1.0], the parameter represents a proportion of documents, integer absolute counts. Defaults to 0.1. - """ - - def __init__(self, batch_size: int = 128, min_df: float = 0.1, **kwargs): - super().__init__(batch_size, min_df) - self.model = CountVectorizer(min_df=min_df, **kwargs) - - -class TfidfSentenceEmbedder(CountSentenceEmbedder): - """Embeds documents using Term Frequency - Inverse Document Frequency (TFIDF) approach. - - Args: - batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. - min_df (float, optional): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float in range of [0.0, 1.0], the parameter represents a proportion of documents, integer absolute counts. Defaults to 0.1. - """ - - def __init__(self, batch_size: int = 128, min_df: float = 0.1, **kwargs): - super().__init__(batch_size, min_df) - self.model = TfidfVectorizer(min_df=min_df, **kwargs) diff --git a/src/embedders/classification/reduce.py b/src/embedders/classification/reduce.py index 5bc226a..c44265c 100644 --- a/src/embedders/classification/reduce.py +++ b/src/embedders/classification/reduce.py @@ -1,8 +1,15 @@ from spacy.tokens.doc import Doc from typing import Union, List, Generator import numpy as np +import pickle from src.embedders import PCAReducer, util +# Embedder imports are used by eval(Embedder) in load methods +from src.embedders.classification.contextual import ( + OpenAISentenceEmbedder, + HuggingFaceSentenceEmbedder, +) + class PCASentenceReducer(PCAReducer): def _transform( @@ -50,3 +57,28 @@ def _reduce( else: embeddings = self.embedder.transform(documents) yield self._transform(embeddings) + + @staticmethod + def load(embedder: dict) -> "PCASentenceReducer": + reducer = pickle.loads( + embedder["reducer_pkl_bytes"].encode("latin-1") + ) # Decode to latin1 to avoid binary issues in JSON + Embedder = eval(embedder["embedder"]["cls"]) + return PCASentenceReducer( + embedder=Embedder.load(embedder["embedder"]), + reducer=reducer, + ) + + def to_json(self) -> dict: + return { + "cls": "PCASentenceReducer", + "embedder": self.embedder.to_json(), + "reducer_pkl_bytes": pickle.dumps(self.reducer).decode( + "latin-1" + ), # Encode to latin1 to avoid binary issues in JSON + } + + def dump(self, project_id: str, embedding_id: str) -> None: + export_file = util.INFERENCE_DIR / project_id / f"embedder-{embedding_id}.json" + export_file.parent.mkdir(parents=True, exist_ok=True) + util.write_json(self.to_json(), export_file, indent=2) diff --git a/src/embedders/extraction/count_based.py b/src/embedders/extraction/count_based.py deleted file mode 100644 index 7c45900..0000000 --- a/src/embedders/extraction/count_based.py +++ /dev/null @@ -1,47 +0,0 @@ -from typing import List, Generator, 
Union -from sklearn.feature_extraction.text import CountVectorizer -from src.embedders import util -from spacy.tokens.doc import Doc - -from src.embedders.extraction import TokenEmbedder - - -class BagOfCharsTokenEmbedder(TokenEmbedder): - """Embeds documents using plain Bag of Characters approach. - - Args: - language_code (str): Name of the spaCy language model - precomputed_docs (bool, optional): If you have a large text corpus, it might make sense to precompute the data and input tokenized spaCy documents. Defaults to False. - batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. - """ - - def __init__( - self, - language_code: str, - precomputed_docs: bool = False, - batch_size: int = 128, - **kwargs - ): - super().__init__(language_code, precomputed_docs, batch_size) - self.model = CountVectorizer(analyzer="char", min_df=0.01, **kwargs) - - def _encode( - self, documents: List[Union[str, Doc]], fit_model: bool - ) -> Generator[List[List[List[int]]], None, None]: - if fit_model: - if self.preloaded: - self.model.fit([doc.text for doc in documents]) - else: - self.model.fit(documents) - - for documents_batch in util.batch(documents, self.batch_size): - documents_batch_embedded = [] - for doc in documents_batch: - documents_batch_embedded.append( - self.model.transform( - [tok.text for tok in self._get_tokenized_document(doc)] - ) - .toarray() - .tolist() - ) - yield documents_batch_embedded diff --git a/src/embedders/util.py b/src/embedders/util.py index 4b8c1b2..2ddecb5 100644 --- a/src/embedders/util.py +++ b/src/embedders/util.py @@ -1,5 +1,10 @@ from typing import Any, Generator, List +from pathlib import Path import numpy as np +import json +import pickle + +INFERENCE_DIR = Path("/inference") def batch(documents: List[Any], batch_size: int) -> Generator[List[Any], None, None]: @@ -10,3 +15,23 @@ def batch(documents: List[Any], batch_size: int) -> Generator[List[Any], None, N def num_batches(documents: List[Any], batch_size: int) -> int: return int(np.ceil(len(documents) / batch_size)) + + +def read_pickle(file_path: str) -> Any: + with open(file_path, "rb") as f: + return pickle.load(f) + + +def write_pickle(obj: Any, file_path: str, **kwargs) -> None: + with open(file_path, "wb") as f: + pickle.dump(obj, f, **kwargs) + + +def read_json(file_path: str) -> dict[str, Any]: + with open(file_path, "r") as f: + return json.load(f) + + +def write_json(obj: Any, file_path: str, **kwargs) -> None: + with open(file_path, "w") as f: + json.dump(obj, f, **kwargs) diff --git a/src/util/embedders.py b/src/util/embedders.py index cd3d41a..81d1c40 100644 --- a/src/util/embedders.py +++ b/src/util/embedders.py @@ -1,15 +1,8 @@ from typing import Optional -from src.embedders.classification.count_based import ( - BagOfCharsSentenceEmbedder, - BagOfWordsSentenceEmbedder, - TfidfSentenceEmbedder, -) from src.embedders.classification.contextual import ( OpenAISentenceEmbedder, HuggingFaceSentenceEmbedder, - CohereSentenceEmbedder, ) -from src.embedders.extraction.count_based import BagOfCharsTokenEmbedder from src.embedders.extraction.contextual import TransformerTokenEmbedder from src.embedders.classification.reduce import PCASentenceReducer from src.embedders.extraction.reduce import PCATokenReducer @@ -31,16 +24,7 @@ def get_embedder( if embedding_type == enums.EmbeddingType.ON_ATTRIBUTE.value: batch_size = 128 n_components = 64 - if platform == enums.EmbeddingPlatform.PYTHON.value: - if model == "bag-of-characters": - return 
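Note on the `reducer_pkl_bytes` field in `PCASentenceReducer.to_json` above: latin-1 maps every byte value 0–255 one-to-one onto code points 0–255, so arbitrary pickle bytes survive a bytes→str→bytes round-trip and can be embedded in JSON (UTF-8 would reject many byte sequences). A small sketch:

```python
import json
import pickle

payload = pickle.dumps({"n_components": 8})
as_text = payload.decode("latin-1")  # bytes -> JSON-safe str
through_json = json.loads(json.dumps({"blob": as_text}))["blob"]
assert through_json.encode("latin-1") == payload  # str -> original bytes
```

The payload is still a pickle, so loading it executes whatever the pickle encodes; the JSON file is only as trustworthy as the pickle it carries.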
diff --git a/src/util/embedders.py b/src/util/embedders.py
index cd3d41a..81d1c40 100644
--- a/src/util/embedders.py
+++ b/src/util/embedders.py
@@ -1,15 +1,8 @@
 from typing import Optional
-from src.embedders.classification.count_based import (
-    BagOfCharsSentenceEmbedder,
-    BagOfWordsSentenceEmbedder,
-    TfidfSentenceEmbedder,
-)
 from src.embedders.classification.contextual import (
     OpenAISentenceEmbedder,
     HuggingFaceSentenceEmbedder,
-    CohereSentenceEmbedder,
 )
-from src.embedders.extraction.count_based import BagOfCharsTokenEmbedder
 from src.embedders.extraction.contextual import TransformerTokenEmbedder
 from src.embedders.classification.reduce import PCASentenceReducer
 from src.embedders.extraction.reduce import PCATokenReducer
@@ -31,16 +24,7 @@ def get_embedder(
     if embedding_type == enums.EmbeddingType.ON_ATTRIBUTE.value:
         batch_size = 128
         n_components = 64
-        if platform == enums.EmbeddingPlatform.PYTHON.value:
-            if model == "bag-of-characters":
-                return BagOfCharsSentenceEmbedder(batch_size=batch_size)
-            elif model == "bag-of-words":
-                embedder = BagOfWordsSentenceEmbedder(batch_size=batch_size)
-            elif model == "tf-idf":
-                return TfidfSentenceEmbedder(batch_size=batch_size, min_df=0)
-            else:
-                raise Exception(f"Unknown model {model}")
-        elif (
+        if (
             platform == enums.EmbeddingPlatform.OPENAI.value
             or platform == enums.EmbeddingPlatform.AZURE.value
         ):
@@ -58,10 +42,6 @@
             embedder = HuggingFaceSentenceEmbedder(
                 config_string=model, batch_size=batch_size
             )
-        elif platform == enums.EmbeddingPlatform.COHERE.value:
-            embedder = CohereSentenceEmbedder(
-                cohere_api_key=api_token, batch_size=batch_size
-            )
         else:
             raise Exception(f"Unknown platform {platform}")
 
@@ -74,23 +54,12 @@
     else:  # extraction
         batch_size = 32
         n_components = 16
-        if model == "bag-of-characters":
-            return BagOfCharsTokenEmbedder(
+        return PCATokenReducer(
+            TransformerTokenEmbedder(
+                config_string=model,
                 language_code=language_code,
                 precomputed_docs=True,
                 batch_size=batch_size,
-            )
-        if model == "bag-of-words":
-            return None
-        if model == "tf-idf":
-            return None
-        else:
-            return PCATokenReducer(
-                TransformerTokenEmbedder(
-                    config_string=model,
-                    language_code=language_code,
-                    precomputed_docs=True,
-                    batch_size=batch_size,
-                ),
-                n_components=n_components,
-            )
+            ),
+            n_components=n_components,
+        )
diff --git a/submodules/model b/submodules/model
index c5663e9..b41145a 160000
--- a/submodules/model
+++ b/submodules/model
@@ -1 +1 @@
-Subproject commit c5663e9c855c30b431d106811a8330301c84eb24
+Subproject commit b41145ac4d0284b68c65b88baff034123f5403a5
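Note on the removed count-based embedders: they were thin wrappers around scikit-learn vectorizers, so the old behavior remains reproducible outside this service if ever needed — a sketch mirroring the deleted `TfidfSentenceEmbedder` defaults:

```python
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["first document", "second document"]
model = TfidfVectorizer(min_df=0.1)  # the deleted class defaulted to min_df=0.1
model.fit(docs)
embeddings = [model.transform([doc]).toarray().tolist()[0] for doc in docs]
```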