diff --git a/langchain/README.md b/langchain/README.md index bdc3d32..6e2ddc9 100644 --- a/langchain/README.md +++ b/langchain/README.md @@ -1,2 +1,28 @@ # langchain-vectorize +This package contains the LangChain integrations for using Vectorize. + +## Installation and Setup + +Installation of this package: + +```bash +pip install langchain-vectorize +``` + +## Integrations overview + +### Retriever + +See the [LangChain Retriever documentation](https://python.langchain.com/docs/concepts/retrievers/) for more information. +```python +from langchain_vectorize import VectorizeRetriever + +retriever = VectorizeRetriever( + api_token="...", + organization="...", + pipeline_id="...", +) +retriever.invoke("query") +``` +See an example notebook [here](https://github.com/vectorize-io/integrations-python/tree/main/notebooks/langchain_retriever.ipynb). \ No newline at end of file diff --git a/langchain/langchain_vectorize/__init__.py b/langchain/langchain_vectorize/__init__.py index 2351eb0..f1b520b 100644 --- a/langchain/langchain_vectorize/__init__.py +++ b/langchain/langchain_vectorize/__init__.py @@ -1 +1,5 @@ """Vectorize integrations with LangChain.""" + +from langchain_vectorize.retrievers import VectorizeRetriever + +__all__ = ["VectorizeRetriever"] diff --git a/langchain/langchain_vectorize/retrievers.py b/langchain/langchain_vectorize/retrievers.py index 54d1ed7..3559f42 100644 --- a/langchain/langchain_vectorize/retrievers.py +++ b/langchain/langchain_vectorize/retrievers.py @@ -36,7 +36,76 @@ class VectorizeRetriever(BaseRetriever): - """Vectorize retriever.""" + """Vectorize retriever. + + Setup: + Install package ``langchain-vectorize`` + + .. code-block:: bash + + pip install -U langchain-vectorize + + Init args: + api_token: str + The Vectorize API token. + environment: Literal["prod", "dev", "local", "staging"] + The Vectorize API environment. Defaults to "prod". + organization: Optional[str] + The Vectorize organization ID. Defaults to None. 
+ pipeline_id: Optional[str] + The Vectorize pipeline ID. Defaults to None. + num_results: int + Number of documents to return. Defaults to 5. + rerank: bool + Whether to rerank the results. Defaults to False. + metadata_filters: list[dict[str, Any]] + The metadata filters to apply when retrieving the documents. Defaults to []. + + Instantiate: + .. code-block:: python + + from langchain_vectorize import VectorizeRetriever + + retriever = VectorizeRetriever( + api_token="xxxxx", organization="1234", pipeline_id="5678" + ) + + Usage: + .. code-block:: python + + query = "what year was breath of the wild released?" + retriever.invoke(query) + + Use within a chain: + .. code-block:: python + + from langchain_core.output_parsers import StrOutputParser + from langchain_core.prompts import ChatPromptTemplate + from langchain_core.runnables import RunnablePassthrough + from langchain_openai import ChatOpenAI + + prompt = ChatPromptTemplate.from_template( + \"\"\"Answer the question based only on the context provided. + + Context: {context} + + Question: {question}\"\"\" + ) + + llm = ChatOpenAI(model="gpt-4o") + + def format_docs(docs): + return "\\n\\n".join(doc.page_content for doc in docs) + + chain = ( + {"context": retriever | format_docs, "question": RunnablePassthrough()} + | prompt + | llm + | StrOutputParser() + ) + + chain.invoke("how many units did breath of the wild sell in 2020") + """ # noqa: D301 api_token: str """The Vectorize API token.""" @@ -146,7 +215,8 @@ def invoke( .. code-block:: python - retriever.invoke("query") + query = "what year was breath of the wild released?" 
+ docs = retriever.invoke(query, num_results=2) """ kwargs = {} if organization: diff --git a/notebooks/langchain_retriever.ipynb b/notebooks/langchain_retriever.ipynb new file mode 100644 index 0000000..32eaa44 --- /dev/null +++ b/notebooks/langchain_retriever.ipynb @@ -0,0 +1,287 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Langchain Vectorize Retriever\n", + "\n", + "This notebook shows how to use the LangChain Vectorize retriever." + ], + "metadata": { + "id": "zvHrM3wa7IE1" + } + }, + { + "cell_type": "markdown", + "source": [ + "# Setup\n", + "\n", + "In the following steps, we'll setup the Vectorize environment and create a RAG pipeline.\n" + ], + "metadata": { + "id": "r-RswOO5o4K_" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Create a Vectorize Account & Get Your Access Token\n", + "\n", + "Sign up for a free Vectorize account [here](https://platform.vectorize.io/)\n", + "Generate an access token in the [Access Token](https://docs.vectorize.io/rag-pipelines/retrieval-endpoint#access-tokens) section\n", + "Gather your organization ID. 
In your browser's address bar, copy the UUID that follows /organization/ in the URL" + ], + "metadata": { + "id": "FhvmvFKh4Rlh" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Install langchain_vectorize" + ], + "metadata": { + "id": "JdZ5vlzjoDVr" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IJFmtvDLn5R3" + }, + "outputs": [], + "source": [ + "!pip install -qU langchain-vectorize" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Configure token and organization ID\n", + "\n" + ], + "metadata": { + "id": "L2SULMfWpWFX" + } + }, + { + "cell_type": "code", + "source": [ + "import getpass\n", + "import os\n", + "\n", + "VECTORIZE_ORG_ID = getpass.getpass(\"Enter Vectorize organization ID: \")\n", + "VECTORIZE_API_TOKEN = getpass.getpass(\"Enter Vectorize API Token: \")" + ], + "metadata": { + "id": "BnF8KoDZpg2O" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Download a PDF file" + ], + "metadata": { + "id": "Oj10Moznpz67" + } + }, + { + "cell_type": "code", + "source": [ + "! 
wget \"https://raw.githubusercontent.com/vectorize-io/vectorize-clients/refs/tags/python-0.1.3/tests/python/tests/research.pdf\"" + ], + "metadata": { + "id": "eLbbTPytrgNw" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Initialize the vectorize client" + ], + "metadata": { + "id": "7g54J6awtshs" + } + }, + { + "cell_type": "code", + "source": [ + "import vectorize_client as v\n", + "\n", + "api = v.ApiClient(v.Configuration(access_token=VECTORIZE_API_TOKEN))" + ], + "metadata": { + "id": "9Fr4yz5CrFWP" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Create a File Upload Source Connector" + ], + "metadata": { + "id": "wPDoeqETxJrS" + } + }, + { + "cell_type": "code", + "source": [ + "import urllib3, json, os\n", + "\n", + "connectors_api = v.ConnectorsApi(api)\n", + "response = connectors_api.create_source_connector(VECTORIZE_ORG_ID, [{\n", + " \"type\": \"FILE_UPLOAD\",\n", + " \"name\": \"From API\"\n", + " }])\n", + "source_connector_id = response.connectors[0].id" + ], + "metadata": { + "id": "9yEARIcFue5N" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Upload the PDF file" + ], + "metadata": { + "id": "yU3lS6dpxZnQ" + } + }, + { + "cell_type": "code", + "source": [ + "file_path = \"research.pdf\"\n", + "\n", + "http = urllib3.PoolManager()\n", + "uploads_api = v.UploadsApi(api)\n", + "metadata = {\"created-from-api\": True}\n", + "\n", + "upload_response = uploads_api.start_file_upload_to_connector(\n", + " VECTORIZE_ORG_ID, source_connector_id, v.StartFileUploadToConnectorRequest(\n", + " name=file_path.split(\"/\")[-1],\n", + " content_type=\"application/pdf\",\n", + " # add additional metadata that will be stored along with each chunk in the vector database\n", + " metadata=json.dumps(metadata))\n", + ")\n", + "\n", + "with open(file_path, \"rb\") as f:\n", + " response = 
http.request(\"PUT\", upload_response.upload_url, body=f, headers={\"Content-Type\": \"application/pdf\", \"Content-Length\": str(os.path.getsize(file_path))})\n", + "\n", + "if response.status != 200:\n", + " print(\"Upload failed: \", response.data)\n", + "else:\n", + " print(\"Upload successful\")" + ], + "metadata": { + "id": "OIiMIZ8ZxUYF" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Connect to the AI Platform and Vector Database" + ], + "metadata": { + "id": "PdJJfOhfxiIo" + } + }, + { + "cell_type": "code", + "source": [ + "ai_platforms = connectors_api.get_ai_platform_connectors(VECTORIZE_ORG_ID)\n", + "builtin_ai_platform = [c.id for c in ai_platforms.ai_platform_connectors if c.type == \"VECTORIZE\"][0]\n", + "\n", + "vector_databases = connectors_api.get_destination_connectors(VECTORIZE_ORG_ID)\n", + "builtin_vector_db = [c.id for c in vector_databases.destination_connectors if c.type == \"VECTORIZE\"][0]" + ], + "metadata": { + "id": "0ZSGhXJfxjBb" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Configure and Deploy the Pipeline" + ], + "metadata": { + "id": "JWoL-kqQxs5H" + } + }, + { + "cell_type": "code", + "source": [ + "pipelines = v.PipelinesApi(api)\n", + "response = pipelines.create_pipeline(VECTORIZE_ORG_ID, v.PipelineConfigurationSchema(\n", + " source_connectors=[v.SourceConnectorSchema(id=source_connector_id, type=\"FILE_UPLOAD\", config={})],\n", + " destination_connector=v.DestinationConnectorSchema(id=builtin_vector_db, type=\"VECTORIZE\", config={}),\n", + " ai_platform=v.AIPlatformSchema(id=builtin_ai_platform, type=\"VECTORIZE\", config={}),\n", + " pipeline_name=\"My Pipeline From API\",\n", + " schedule=v.ScheduleSchema(type=\"manual\")\n", + "))\n", + "pipeline_id = response.data.id" + ], + "metadata": { + "id": "hze9vJbQxvqA" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + 
"source": [ + "# Use the LangChain Vectorize retriever" + ], + "metadata": { + "id": "5ULion9wyj6T" + } + }, + { + "cell_type": "code", + "source": [ + "from langchain_vectorize.retrievers import VectorizeRetriever\n", + "\n", + "retriever = VectorizeRetriever(\n", + " api_token=VECTORIZE_API_TOKEN,\n", + " organization=VECTORIZE_ORG_ID,\n", + " pipeline_id=pipeline_id,\n", + ")\n", + "\n", + "retriever.invoke(\"Apple Shareholders equity\", num_results=2)\n" + ], + "metadata": { + "id": "9D-QfiW7yoe0" + }, + "execution_count": null, + "outputs": [] + } + ] +}