|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "markdown", |
| 5 | + "metadata": {}, |
| 6 | + "source": [ |
| 7 | + "# Vector Search using Azure Cosmos DB API for MongoDB\n", |
| 8 | + "\n", |
| 9 | +    "This notebook demonstrates how to use an Azure OpenAI embedding model to vectorize documents already stored in Azure Cosmos DB API for MongoDB, store the resulting embedding vectors back on each document, and create a vector index. Lastly, it shows how to query the vector index to find similar documents.\n",
| 10 | + "\n", |
| 11 | + "This lab expects the data that was loaded in Lab 2." |
| 12 | + ] |
| 13 | + }, |
| 14 | + { |
| 15 | + "cell_type": "code", |
| 16 | + "execution_count": null, |
| 17 | + "metadata": {}, |
| 18 | + "outputs": [], |
| 19 | + "source": [ |
| 20 | + "import os\n", |
| 21 | + "import pymongo\n", |
| 22 | + "import time\n", |
| 23 | + "import json\n", |
| 24 | + "from openai import AzureOpenAI\n", |
| 25 | + "from dotenv import load_dotenv\n", |
| 26 | + "from tenacity import retry, wait_random_exponential, stop_after_attempt" |
| 27 | + ] |
| 28 | + }, |
| 29 | + { |
| 30 | + "cell_type": "markdown", |
| 31 | + "metadata": {}, |
| 32 | + "source": [ |
| 33 | + "## Load settings\n", |
| 34 | + "\n", |
| 35 | +    "This lab uses the `.env` file that was created in Lab 1 to obtain the connection string for the database.\n",
| 36 | + "\n", |
| 37 | +    "Add the following entries to the `.env` file to support the connection to the Azure OpenAI API, replacing the values for `<your key>` and `<your endpoint>` with the values from your Azure OpenAI resource.\n",
| 38 | + "\n", |
| 39 | + "```text\n", |
| 40 | + "AOAI_ENDPOINT=\"<your endpoint>\"\n", |
| 41 | +    "AOAI_KEY=\"<your key>\"\n",
| 42 | + "```" |
| 43 | + ] |
| 44 | + }, |
| 45 | + { |
| 46 | + "cell_type": "code", |
| 47 | + "execution_count": null, |
| 48 | + "metadata": {}, |
| 49 | + "outputs": [], |
| 50 | + "source": [ |
| 51 | + "load_dotenv()\n", |
| 52 | + "CONNECTION_STRING = os.environ.get(\"DB_CONNECTION_STRING\")\n", |
| 53 | + "EMBEDDINGS_DEPLOYMENT_NAME = \"embeddings\"\n", |
| 54 | + "AOAI_ENDPOINT = os.environ.get(\"AOAI_ENDPOINT\")\n", |
| 55 | + "AOAI_KEY = os.environ.get(\"AOAI_KEY\")\n", |
| 56 | + "AOAI_API_VERSION = \"2023-05-15\"" |
| 57 | + ] |
| 58 | + }, |
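|     | +  {
|     | +   "cell_type": "markdown",
|     | +   "metadata": {},
|     | +   "source": [
|     | +    "As an optional sanity check (a minimal sketch, assuming the `.env` entries above are the only settings this notebook needs), the next cell fails fast with a clear message if any required setting is missing."
|     | +   ]
|     | +  },
|     | +  {
|     | +   "cell_type": "code",
|     | +   "execution_count": null,
|     | +   "metadata": {},
|     | +   "outputs": [],
|     | +   "source": [
|     | +    "# Optional check: verify that all required settings were loaded from the .env file\n",
|     | +    "for name, value in [(\"DB_CONNECTION_STRING\", CONNECTION_STRING),\n",
|     | +    "                    (\"AOAI_ENDPOINT\", AOAI_ENDPOINT),\n",
|     | +    "                    (\"AOAI_KEY\", AOAI_KEY)]:\n",
|     | +    "    if not value:\n",
|     | +    "        raise ValueError(f\"Missing setting: {name}. Check the .env file.\")"
|     | +   ]
|     | +  },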
| 59 | + { |
| 60 | + "cell_type": "markdown", |
| 61 | + "metadata": {}, |
| 62 | + "source": [ |
| 63 | + "## Establish connectivity to the database" |
| 64 | + ] |
| 65 | + }, |
| 66 | + { |
| 67 | + "cell_type": "code", |
| 68 | + "execution_count": null, |
| 69 | + "metadata": {}, |
| 70 | + "outputs": [], |
| 71 | + "source": [ |
| 72 | + "db_client = pymongo.MongoClient(CONNECTION_STRING)\n", |
| 73 | + "# Create database to hold cosmic works data\n", |
| 74 | + "# MongoDB will create the database if it does not exist\n", |
| 75 | + "db = db_client.cosmic_works" |
| 76 | + ] |
| 77 | + }, |
| 78 | + { |
| 79 | + "cell_type": "markdown", |
| 80 | + "metadata": {}, |
| 81 | + "source": [ |
| 82 | + "## Establish Azure OpenAI connectivity" |
| 83 | + ] |
| 84 | + }, |
| 85 | + { |
| 86 | + "cell_type": "code", |
| 87 | + "execution_count": null, |
| 88 | + "metadata": {}, |
| 89 | + "outputs": [], |
| 90 | + "source": [ |
| 91 | + "ai_client = AzureOpenAI(\n", |
| 92 | + " azure_endpoint = AOAI_ENDPOINT,\n", |
| 93 | + " api_version = AOAI_API_VERSION,\n", |
| 94 | + " api_key = AOAI_KEY\n", |
| 95 | + " )" |
| 96 | + ] |
| 97 | + }, |
| 98 | + { |
| 99 | + "cell_type": "markdown", |
| 100 | + "metadata": {}, |
| 101 | + "source": [ |
| 102 | + "## Vectorize and store the embeddings in each document\n", |
| 103 | + "\n", |
| 104 | +    "The vector embedding field on each document only needs to be created once. However, if a document changes, its vector embedding must be regenerated; a short example of refreshing a single document follows the collection updates below."
| 105 | + ] |
| 106 | + }, |
| 107 | + { |
| 108 | + "cell_type": "code", |
| 109 | + "execution_count": null, |
| 110 | + "metadata": {}, |
| 111 | + "outputs": [], |
| 112 | + "source": [ |
| 113 | + "@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(3))\n", |
| 114 | + "def generate_embeddings(text: str):\n", |
| 115 | + " '''\n", |
| 116 | +    "    Generate embeddings from a string of text using the deployed Azure OpenAI API embeddings model.\n",
| 117 | + " This will be used to vectorize document data and incoming user messages for a similarity search with\n", |
| 118 | + " the vector index.\n", |
| 119 | + " '''\n", |
| 120 | + " response = ai_client.embeddings.create(input=text, model=EMBEDDINGS_DEPLOYMENT_NAME)\n", |
| 121 | + " embeddings = response.data[0].embedding\n", |
| 122 | + " time.sleep(0.5) # rest period to avoid rate limiting on AOAI for free tier\n", |
| 123 | + " return embeddings" |
| 124 | + ] |
| 125 | + }, |
| 126 | + { |
| 127 | + "cell_type": "code", |
| 128 | + "execution_count": null, |
| 129 | + "metadata": {}, |
| 130 | + "outputs": [], |
| 131 | + "source": [ |
| 132 | + "# demonstrate embeddings generation using a test string\n", |
| 133 | + "test = \"hello, world\"\n", |
| 134 | + "print(generate_embeddings(test))" |
| 135 | + ] |
| 136 | + }, |
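|     | +  {
|     | +   "cell_type": "markdown",
|     | +   "metadata": {},
|     | +   "source": [
|     | +    "The vector indexes created later in this notebook declare `dimensions: 1536`, which must match the length of the vectors returned by the `embeddings` deployment (assumed here to be a `text-embedding-ada-002` model, which produces 1536-dimensional vectors). The following optional check prints the embedding length for the test string above."
|     | +   ]
|     | +  },
|     | +  {
|     | +   "cell_type": "code",
|     | +   "execution_count": null,
|     | +   "metadata": {},
|     | +   "outputs": [],
|     | +   "source": [
|     | +    "# Optional check: the embedding length must match the 'dimensions' value used for the vector indexes below\n",
|     | +    "print(f\"Embedding length: {len(generate_embeddings(test))}\")"
|     | +   ]
|     | +  },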
| 137 | + { |
| 138 | + "cell_type": "markdown", |
| 139 | + "metadata": {}, |
| 140 | + "source": [ |
| 141 | + "### Vectorize and update all documents in the Cosmic Works database" |
| 142 | + ] |
| 143 | + }, |
| 144 | + { |
| 145 | + "cell_type": "code", |
| 146 | + "execution_count": null, |
| 147 | + "metadata": {}, |
| 148 | + "outputs": [], |
| 149 | + "source": [ |
| 150 | + "def add_collection_content_vector_field(collection_name: str):\n", |
| 151 | + " '''\n", |
| 152 | + " Add a new field to the collection to hold the vectorized content of each document.\n", |
| 153 | + " '''\n", |
| 154 | + " collection = db[collection_name]\n", |
| 155 | + " bulk_operations = []\n", |
| 156 | + " for doc in collection.find():\n", |
| 157 | + " # remove any previous contentVector embeddings\n", |
| 158 | + " if \"contentVector\" in doc:\n", |
| 159 | + " del doc[\"contentVector\"]\n", |
| 160 | + "\n", |
| 161 | + " # generate embeddings for the document string representation\n", |
| 162 | + " content = json.dumps(doc, default=str)\n", |
| 163 | + " content_vector = generate_embeddings(content) \n", |
| 164 | + " \n", |
| 165 | + " bulk_operations.append(pymongo.UpdateOne(\n", |
| 166 | + " {\"_id\": doc[\"_id\"]},\n", |
| 167 | + " {\"$set\": {\"contentVector\": content_vector}},\n", |
| 168 | + " upsert=True\n", |
| 169 | + " ))\n", |
| 170 | + " # execute bulk operations\n", |
| 171 | + " collection.bulk_write(bulk_operations)" |
| 172 | + ] |
| 173 | + }, |
| 174 | + { |
| 175 | + "cell_type": "code", |
| 176 | + "execution_count": null, |
| 177 | + "metadata": {}, |
| 178 | + "outputs": [], |
| 179 | + "source": [ |
| 180 | + "# Add vector field to products documents - this will take approximately 3-5 minutes due to rate limiting\n", |
| 181 | + "add_collection_content_vector_field(\"products\")" |
| 182 | + ] |
| 183 | + }, |
| 184 | + { |
| 185 | + "cell_type": "code", |
| 186 | + "execution_count": null, |
| 187 | + "metadata": {}, |
| 188 | + "outputs": [], |
| 189 | + "source": [ |
| 190 | + "# Add vector field to customers documents - this will take approximately 1-2 minutes due to rate limiting\n", |
| 191 | + "add_collection_content_vector_field(\"customers\")" |
| 192 | + ] |
| 193 | + }, |
| 194 | + { |
| 195 | + "cell_type": "code", |
| 196 | + "execution_count": null, |
| 197 | + "metadata": {}, |
| 198 | + "outputs": [], |
| 199 | + "source": [ |
| 200 | +    "# Add vector field to sales documents - this will take approximately 15-20 minutes due to rate limiting\n",
| 201 | + "add_collection_content_vector_field(\"sales\")" |
| 202 | + ] |
| 203 | + }, |
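|     | +  {
|     | +   "cell_type": "markdown",
|     | +   "metadata": {},
|     | +   "source": [
|     | +    "As noted earlier, if a document changes after this initial pass, its `contentVector` must be regenerated. The cell below is a minimal, illustrative sketch of that refresh for a single product document (it assumes at least one product exists and does not change any other field)."
|     | +   ]
|     | +  },
|     | +  {
|     | +   "cell_type": "code",
|     | +   "execution_count": null,
|     | +   "metadata": {},
|     | +   "outputs": [],
|     | +   "source": [
|     | +    "# Illustrative only: refresh the contentVector of a single (changed) product document\n",
|     | +    "product = db.products.find_one({})  # stand-in for a document that was just updated\n",
|     | +    "if product is not None:\n",
|     | +    "    # exclude the old vector from the text that gets embedded\n",
|     | +    "    product.pop(\"contentVector\", None)\n",
|     | +    "    new_vector = generate_embeddings(json.dumps(product, default=str))\n",
|     | +    "    db.products.update_one({\"_id\": product[\"_id\"]}, {\"$set\": {\"contentVector\": new_vector}})"
|     | +   ]
|     | +  },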
| 204 | + { |
| 205 | + "cell_type": "code", |
| 206 | + "execution_count": null, |
| 207 | + "metadata": {}, |
| 208 | + "outputs": [], |
| 209 | + "source": [ |
| 210 | + "# Create the products vector index\n", |
| 211 | + "db.command({\n", |
| 212 | + " 'createIndexes': 'products',\n", |
| 213 | + " 'indexes': [\n", |
| 214 | + " {\n", |
| 215 | + " 'name': 'VectorSearchIndex',\n", |
| 216 | + " 'key': {\n", |
| 217 | + " \"contentVector\": \"cosmosSearch\"\n", |
| 218 | + " },\n", |
| 219 | + " 'cosmosSearchOptions': {\n", |
| 220 | + " 'kind': 'vector-ivf',\n", |
| 221 | + " 'numLists': 1,\n", |
| 222 | + " 'similarity': 'COS',\n", |
| 223 | + " 'dimensions': 1536\n", |
| 224 | + " }\n", |
| 225 | + " }\n", |
| 226 | + " ]\n", |
| 227 | + "})\n", |
| 228 | + "\n", |
| 229 | + "# Create the customers vector index\n", |
| 230 | + "db.command({\n", |
| 231 | + " 'createIndexes': 'customers',\n", |
| 232 | + " 'indexes': [\n", |
| 233 | + " {\n", |
| 234 | + " 'name': 'VectorSearchIndex',\n", |
| 235 | + " 'key': {\n", |
| 236 | + " \"contentVector\": \"cosmosSearch\"\n", |
| 237 | + " },\n", |
| 238 | + " 'cosmosSearchOptions': {\n", |
| 239 | + " 'kind': 'vector-ivf',\n", |
| 240 | + " 'numLists': 1,\n", |
| 241 | + " 'similarity': 'COS',\n", |
| 242 | + " 'dimensions': 1536\n", |
| 243 | + " }\n", |
| 244 | + " }\n", |
| 245 | + " ]\n", |
| 246 | + "})\n", |
| 247 | + "\n", |
| 248 | + "# Create the sales vector index\n", |
| 249 | + "db.command({\n", |
| 250 | + " 'createIndexes': 'sales',\n", |
| 251 | + " 'indexes': [\n", |
| 252 | + " {\n", |
| 253 | + " 'name': 'VectorSearchIndex',\n", |
| 254 | + " 'key': {\n", |
| 255 | + " \"contentVector\": \"cosmosSearch\"\n", |
| 256 | + " },\n", |
| 257 | + " 'cosmosSearchOptions': {\n", |
| 258 | + " 'kind': 'vector-ivf',\n", |
| 259 | + " 'numLists': 1,\n", |
| 260 | + " 'similarity': 'COS',\n", |
| 261 | + " 'dimensions': 1536\n", |
| 262 | + " }\n", |
| 263 | + " }\n", |
| 264 | + " ]\n", |
| 265 | + "})" |
| 266 | + ] |
| 267 | + }, |
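|     | +  {
|     | +   "cell_type": "markdown",
|     | +   "metadata": {},
|     | +   "source": [
|     | +    "To confirm that the `VectorSearchIndex` indexes were created, the next cell lists the index names on each collection. This is an optional check; `index_information()` is the standard PyMongo call for listing a collection's indexes."
|     | +   ]
|     | +  },
|     | +  {
|     | +   "cell_type": "code",
|     | +   "execution_count": null,
|     | +   "metadata": {},
|     | +   "outputs": [],
|     | +   "source": [
|     | +    "# Optional check: list the indexes on each collection to confirm the vector indexes exist\n",
|     | +    "for collection_name in [\"products\", \"customers\", \"sales\"]:\n",
|     | +    "    print(collection_name, list(db[collection_name].index_information().keys()))"
|     | +   ]
|     | +  },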
| 268 | + { |
| 269 | + "cell_type": "markdown", |
| 270 | + "metadata": {}, |
| 271 | + "source": [ |
| 272 | + "## Use vector search in Azure Cosmos DB API for MongoDB vCore\n", |
| 273 | + "\n", |
| 274 | +    "Now that each document has its associated vector embedding and the vector indexes have been created on each collection, we can use the vector search capabilities of Azure Cosmos DB API for MongoDB vCore."
| 275 | + ] |
| 276 | + }, |
| 277 | + { |
| 278 | + "cell_type": "code", |
| 279 | + "execution_count": null, |
| 280 | + "metadata": {}, |
| 281 | + "outputs": [], |
| 282 | + "source": [ |
| 283 | + "def vector_search(collection_name, query, num_results=3):\n", |
| 284 | + " \"\"\"\n", |
| 285 | + " Perform a vector search on the specified collection by vectorizing\n", |
| 286 | + " the query and searching the vector index for the most similar documents.\n", |
| 287 | + "\n", |
| 288 | +    "    Returns a cursor over the num_results most similar documents.\n",
| 289 | + " \"\"\"\n", |
| 290 | + " collection = db[collection_name]\n", |
| 291 | + " query_embedding = generate_embeddings(query) \n", |
| 292 | + " pipeline = [\n", |
| 293 | + " {\n", |
| 294 | + " '$search': {\n", |
| 295 | + " \"cosmosSearch\": {\n", |
| 296 | + " \"vector\": query_embedding,\n", |
| 297 | + " \"path\": \"contentVector\",\n", |
| 298 | + " \"k\": num_results\n", |
| 299 | +    "                },\n",
| 300 | +    "                \"returnStoredSource\": True\n",
|     | +    "            }\n",
|     | +    "        },\n",
| 301 | +    "        {'$project': {'similarityScore': {'$meta': 'searchScore'}, 'document': '$$ROOT'}}\n",
| 302 | + " ]\n", |
| 303 | + " results = collection.aggregate(pipeline)\n", |
| 304 | + " return results\n", |
| 305 | + "\n", |
| 306 | + "def print_product_search_result(result):\n", |
| 307 | + " '''\n", |
| 308 | + " Print the search result document in a readable format\n", |
| 309 | + " '''\n", |
| 310 | + " print(f\"Similarity Score: {result['similarityScore']}\") \n", |
| 311 | + " print(f\"Name: {result['document']['name']}\") \n", |
| 312 | + " print(f\"Category: {result['document']['categoryName']}\")\n", |
| 313 | +    "    print(f\"SKU: {result['document']['sku']}\")\n",
| 314 | + " print(f\"_id: {result['document']['_id']}\\n\")" |
| 315 | + ] |
| 316 | + }, |
| 317 | + { |
| 318 | + "cell_type": "code", |
| 319 | + "execution_count": null, |
| 320 | + "metadata": {}, |
| 321 | + "outputs": [], |
| 322 | + "source": [ |
| 323 | + "query = \"What bikes do you have?\"\n", |
| 324 | + "results = vector_search(\"products\", query, num_results=4)\n", |
| 325 | + "for result in results:\n", |
| 326 | + " print_product_search_result(result) " |
| 327 | + ] |
| 328 | + }, |
| 329 | + { |
| 330 | + "cell_type": "code", |
| 331 | + "execution_count": null, |
| 332 | + "metadata": {}, |
| 333 | + "outputs": [], |
| 334 | + "source": [ |
| 335 | + "query = \"What do you have that is yellow?\"\n", |
| 336 | + "results = vector_search(\"products\", query, num_results=4)\n", |
| 337 | + "for result in results:\n", |
| 338 | + " print_product_search_result(result) " |
| 339 | + ] |
| 340 | + } |
| 341 | + ], |
| 342 | + "metadata": { |
| 343 | + "kernelspec": { |
| 344 | + "display_name": ".venv", |
| 345 | + "language": "python", |
| 346 | + "name": "python3" |
| 347 | + }, |
| 348 | + "language_info": { |
| 349 | + "codemirror_mode": { |
| 350 | + "name": "ipython", |
| 351 | + "version": 3 |
| 352 | + }, |
| 353 | + "file_extension": ".py", |
| 354 | + "mimetype": "text/x-python", |
| 355 | + "name": "python", |
| 356 | + "nbconvert_exporter": "python", |
| 357 | + "pygments_lexer": "ipython3", |
| 358 | + "version": "3.11.5" |
| 359 | + } |
| 360 | + }, |
| 361 | + "nbformat": 4, |
| 362 | + "nbformat_minor": 2 |
| 363 | +} |