Commit 94faa11

Merge pull request #8 from hummusonrails/add-more

Updates and further work on workshop

2 parents: 6afb2a6 + c6da3d2
68 files changed: 47091 additions & 2 deletions


README.md

Lines changed: 105 additions & 2 deletions
@@ -57,6 +57,15 @@ After creating a cluster, you can create a new bucket by following the steps bel
Before we can index and search data, we need to transform it into a format that can be used by the vector search engine. We will be using [Couchbase Vector Search](https://docs.couchbase.com/server/current/fts/fts-vector-search.html) for this workshop.

There are two options in this workshop to generate vector embeddings from data:

1. Use the `/embed` endpoint provided in this repository to transform the data. *You need an OpenAI API key to use this option.*
2. Import the data with *already generated embeddings* directly into the Couchbase bucket. You can use the data provided in the `./data/individual_items_with_embedding` directory.

Follow the instructions below for the option you choose.

### Option 1: Use the `/embed` Endpoint

Provided in this repository is an Express.js application that exposes a `/embed` endpoint to transform the data.

The Codespace environment already has all the dependencies installed. You can start the Express.js application by running the following command:
@@ -65,15 +74,109 @@ The Codespace environment already has all the dependencies installed. You can st
```bash
node server.js
```

The repository also has a sample set of data in the `./data/individual_items` directory. You can transform this data by making a POST request to the `/embed` endpoint providing the paths to the data files as an array in the request body.

```bash
curl -X POST http://localhost:3000/embed -H "Content-Type: application/json" -d '["./data/data1.json", "./data/data2.json"]'
```

The data has now been converted into vector embeddings and stored in the Couchbase bucket that you created earlier.
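
For orientation, the sketch below shows roughly what a `/embed` handler of this kind can do: read each file listed in the request body, generate an embedding, and upsert the result into Couchbase. This is not the repository's actual `server.js`; it is a minimal illustration that assumes the `openai` and `couchbase` npm packages and hypothetical `CB_*` environment variables for the connection details.

```javascript
// embed-sketch.js -- illustrative only; the workshop's server.js may differ.
const express = require('express');
const fs = require('fs/promises');
const couchbase = require('couchbase');
const OpenAI = require('openai');

const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
const app = express();
app.use(express.json());

app.post('/embed', async (req, res) => {
  // Hypothetical env vars: CB_CONNSTR, CB_USERNAME, CB_PASSWORD, CB_BUCKET.
  const cluster = await couchbase.connect(process.env.CB_CONNSTR, {
    username: process.env.CB_USERNAME,
    password: process.env.CB_PASSWORD,
  });
  const collection = cluster.bucket(process.env.CB_BUCKET).defaultCollection();

  // The request body is an array of file paths, e.g. ["./data/data1.json"].
  for (const path of req.body) {
    const doc = JSON.parse(await fs.readFile(path, 'utf8'));

    // Generate an embedding for the document's JSON content
    // (the real app may embed a specific field instead).
    const response = await openai.embeddings.create({
      model: 'text-embedding-ada-002',
      input: JSON.stringify(doc),
    });
    doc.embedding = response.data[0].embedding;

    // Use the file path as the document key for this sketch.
    await collection.upsert(path, doc);
  }

  res.json({ status: 'ok', files: req.body.length });
});

app.listen(3000, () => console.log('Listening on port 3000'));
```
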
### Option 2: Import Data with Embeddings

If you choose to import the data directly, you can use the data provided in the `./data/individual_items_with_embedding` directory. The data is already in the format required to enable vector search on it.

Once you have opened this repository in a [GitHub Codespace](https://codespaces.new/hummusonrails/vector-search-nodejs-workshop), you can import the data with the generated embeddings using the [Couchbase shell](https://couchbase.sh/docs/#_importing_data) from the command line.

#### Edit the Config File

First, edit the `./config_file/config` file with your Couchbase Capella information.

Under the `[[cluster]]` section:

- Replace the empty string value for `identifier` with the name of the cluster you created earlier.
- Replace the empty string value for `connstr` with the connection string to your cluster.
  - Found in `Menu > Connect`
  ![](workshop_images/menu_with_connect_highlighted.png)
- Replace the empty string for `default-bucket` with the name of the bucket you created earlier.
- Replace the empty strings for `username` and `password` with the username and password of your Couchbase Capella account.
  - Found in `Menu > Settings > Cluster Access`
  ![](workshop_images/menu_with_settings_highlighted.png)
- Replace the empty string for `capella-organization` with the name of your organization.
  - Found by clicking on your avatar icon (usually your initials), then `Organizations`
  - If your organization name has multiple words, replace the spaces with dashes and use lowercase, e.g. "My Organization" becomes "my-organization".
  ![](workshop_images/menu_with_organizations_highlighted.png)

Under the `[[capella-organization]]` section:

- Replace the `identifier` empty string value with the name of your organization, formatted as in the previous step.
- Replace the `access-key` and `secret-key` empty string values with the access key and secret key for your organization.
  - Found in `Menu > Settings > API Keys`
  ![](workshop_images/menu_with_api_keys_highlighted.png)
- Replace the `default-project` empty string value with the name of the project you created earlier.
  - Found in the top-level view of all your clusters.
  ![](workshop_images/cluster_list_with_project_name.png)

#### Import Data with Couchbase Shell

Change into the directory where the data files with embeddings are:

```bash
cd data/individual_items_with_embedding
```

Open up Couchbase Shell, passing in the location of the directory that holds the config file defining your Couchbase information:

```bash
cbsh --config-dir ../config_file
```

Once in the shell, run the `nodes` command as a sanity check that you are connected to the correct cluster:

```bash
> nodes
```

This should output something similar to the following:

```bash
╭───┬───────────┬────────────────┬─────────┬──────────────────────────┬───────────────────────┬───────────────────────────┬──────────────┬─────────────┬─────────╮
│ # │ cluster │ hostname │ status │ services │ version │ os │ memory_total │ memory_free │ capella │
├───┼───────────┼────────────────┼─────────┼──────────────────────────┼───────────────────────┼───────────────────────────┼──────────────┼─────────────┼─────────┤
│ 0 │ dev.local │ 127.0.0.1:8091 │ healthy │ search,indexing,kv,query │ 8.0.0-1246-enterprise │ x86_64-apple-darwin19.6.0 │ 34359738368 │ 12026126336 │ false │
╰───┴───────────┴────────────────┴─────────┴──────────────────────────┴───────────────────────┴───────────────────────────┴──────────────┴─────────────┴─────────╯
```

Now, import the data into the bucket you created earlier:

```bash
> ls *_with_embedding.json | each { |it| open $it.name | wrap content | insert id $in.content._default.name } | doc upsert
```

This opens each `*_with_embedding.json` file, wraps its parsed contents in a `content` field, adds an `id` taken from `content._default.name`, and upserts each resulting document into your default bucket.

Once this is done, you can perform a sanity check to ensure the documents were inserted by running a query that selects just one:

```bash
> query "select * from name_of_your_bucket._default._default limit 1"
```

Replace `name_of_your_bucket` with the name of the bucket you created.
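
If you would rather confirm the import from Node.js instead of from Couchbase Shell, a short script along the following lines could work. It is a sketch with placeholder connection details, assuming the `couchbase` npm package:

```javascript
// verify-import.js -- illustrative sketch; replace the placeholder values.
const couchbase = require('couchbase');

async function main() {
  const cluster = await couchbase.connect('couchbases://your-connection-string', {
    username: 'your-username',
    password: 'your-password',
  });

  // Select a single document from the default scope and collection.
  const result = await cluster.query(
    'SELECT * FROM `name_of_your_bucket`._default._default LIMIT 1'
  );
  console.log(JSON.stringify(result.rows[0], null, 2));
}

main().catch(console.error);
```
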
## Index Data

Once the vector embeddings have been stored in the Couchbase bucket, we can create a vector search index to enable similarity search.

You will use Couchbase Shell to perform this action as well.

Run the following command from inside the shell:

```bash
> vector create-index --bucket name_of_your_bucket --similarity-metric dot_product vector-search-index embedding 1536
```

This creates an index named `vector-search-index` on the `embedding` field for 1536-dimensional vectors, using the dot product similarity metric. Replace `name_of_your_bucket` with the name of the bucket you created.

You can perform a sanity check to ensure the index was created by querying for all the indexes; you should see `vector-search-index` in the list:

```bash
> query indexes
```
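
With the index created, you can also run a vector similarity search from Node.js. The sketch below is illustrative rather than the workshop's final application code: it assumes a recent `couchbase` SDK release with vector search support, the `openai` package to embed the query text, placeholder credentials, and a made-up query string.

```javascript
// vector-search-sketch.js -- illustrative only; adjust names and credentials.
const couchbase = require('couchbase');
const { SearchRequest, VectorSearch, VectorQuery } = couchbase;
const OpenAI = require('openai');

async function main() {
  const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
  const cluster = await couchbase.connect('couchbases://your-connection-string', {
    username: 'your-username',
    password: 'your-password',
  });

  // Embed the search text with the same model used for the stored documents
  // so the query vector lives in the same 1536-dimensional space.
  const { data } = await openai.embeddings.create({
    model: 'text-embedding-ada-002',
    input: 'a cozy reading chair', // hypothetical query text
  });
  const queryVector = data[0].embedding;

  // Ask the vector-search-index for the 5 nearest documents by dot product.
  const request = SearchRequest.create(
    VectorSearch.fromVectorQuery(
      VectorQuery.create('embedding', queryVector).numCandidates(5)
    )
  );
  const result = await cluster.search('vector-search-index', request);

  for (const row of result.rows) {
    console.log(row.id, row.score);
  }
}

main().catch(console.error);
```
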

config_file/config

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
version = 1
llms = []

[[cluster]]
identifier = "" # This is the name of the cluster you created
connstr = "" # This is the connection string for the cluster and can be found in the Capella UI
default-bucket = "" # This is the name of the bucket you created
username = "" # This is the username you created in the connect settings in the Capella UI
password = "" # This is the password you created in the connect settings in the Capella UI
default-collection = "_default" # Keep these as is unless you changed the defaults in the Capella UI
default-scope = "_default" # Keep these as is unless you changed the defaults in the Capella UI
data-timeout = "10s" # Keep as is
connect-timeout = "1m 15s" # Keep as is
search-timeout = "1m 15s" # Keep as is
analytics-timeout = "1m 15s" # Keep as is
management-timeout = "1m 15s" # Keep as is
transaction-timeout = "1m 15s" # Keep as is
tls-enabled = true # Keep as is
tls-accept-all-certs = true # Keep as is
capella-organization = "" # This is the name of your organization; if it has multiple words, use hyphens and lowercase

[[capella-organization]]
identifier = "" # This is the name of your organization; if it has multiple words, use hyphens and lowercase
access-key = "" # This is the access key for your organization found in the Capella UI
secret-key = "" # This is the secret key for your organization found in the Capella UI
default-project = "" # This is the name of the project you created where all your clusters are stored

4 files renamed without changes.
