Usability improvements for 25.08 release (#324)

shashank3959 · web-flow · commit 265e9d7fdaaf · 2025-08-07T10:09:39.000-07:00
- Remedy review feedback
- Fully automate the notebooks so that they can run end to end
without user interaction

Signed-off-by: Shashank Verma <shashankv@nvidia.com>
diff --git a/nemo/data-flywheel/tool-calling/2_finetuning_and_inference.ipynb b/nemo/data-flywheel/tool-calling/2_finetuning_and_inference.ipynb
@@ -594,7 +594,7 @@
     "\n",
     "customization = client_with_wandb.customization.jobs.create(\n",
     "    name=\"llama-3.2-1b-xlam-ft\",\n",
-    "    output_model=f\"{NMS_NAMESPACE}/llama-3.2-1b-xlam-run1\",\n",
+    "    output_model=CUSTOM_MODEL,\n",
     "    config=f\"{BASE_MODEL}@{BASE_MODEL_VERSION}\",\n",
     "    dataset={\"name\": DATASET_NAME, \"namespace\": NMS_NAMESPACE},\n",
     "    hyperparameters={\n",
@@ -952,12 +952,57 @@
     "print(\"Job Status:\", json.dumps(job_status.model_dump(), indent=2, default=str))"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e84514e1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define a helper that waits for the customization job to complete\n",
+    "\n",
+    "from time import sleep, time\n",
+    "\n",
+    "def wait_job(nemo_client, job_id: str, polling_interval: int = 10, timeout: int = 6000):\n",
+    "    \"\"\"Helper that waits for a customization job to finish, using the SDK.\"\"\"\n",
+    "    start_time = time()\n",
+    "    job = nemo_client.customization.jobs.retrieve(job_id=job_id)\n",
+    "    status = job.status\n",
+    "\n",
+    "    while (status in [\"pending\", \"created\", \"running\"]):\n",
+    "        # Check for timeout\n",
+    "        if time() - start_time > timeout:\n",
+    "            raise RuntimeError(f\"Took more than {timeout} seconds.\")\n",
+    "\n",
+    "        # Sleep before polling again\n",
+    "        sleep(polling_interval)\n",
+    "\n",
+    "        # Fetch updated status and progress\n",
+    "        job = nemo_client.customization.jobs.retrieve(job_id=job_id)\n",
+    "        status = job.status\n",
+    "        progress = 0.0\n",
+    "        if status == \"running\" and job.status_details:\n",
+    "            progress = job.status_details.percentage_done or 0.0\n",
+    "        elif status == \"completed\":\n",
+    "            progress = 100\n",
+    "\n",
+    "        print(f\"Job status: {status} after {time() - start_time:.2f} seconds. Progress: {progress}%\")\n",
+    "\n",
+    "\n",
+    "    return job\n",
+    "\n",
+    "job = wait_job(nemo_client, JOB_ID, polling_interval=5, timeout=2400)\n",
+    "\n",
+    "# Wait an extra 2 minutes: the job can report as finished before the fine-tuned model is ready in NIM.\n",
+    "sleep(120)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "42b721be-8ca0-4e8f-99a7-5eb12ea1b47f",
    "metadata": {},
    "source": [
-    "**IMPORTANT:** Monitor the job status. Ensure training is completed before proceeding by observing the `percentage_done` key in the response frame."
+    "**IMPORTANT:** At this point, the customization job should be completed. If waiting for the job to finish failed or the status is not `\"completed\"`, please check the logs (`job.status_details.status_logs`)."
    ]
   },
   {
diff --git a/nemo/data-flywheel/tool-calling/3_model_evaluation.ipynb b/nemo/data-flywheel/tool-calling/3_model_evaluation.ipynb
@@ -124,7 +124,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "CUSTOMIZED_MODEL = \"\" # paste from the previous notebook"
+    "CUSTOMIZED_MODEL = CUSTOM_MODEL # CUSTOM_MODEL is defined in config.py; no need to paste from the previous notebook"
    ]
   },
   {
diff --git a/nemo/data-flywheel/tool-calling/4_adding_safety_guardrails.ipynb b/nemo/data-flywheel/tool-calling/4_adding_safety_guardrails.ipynb
@@ -101,7 +101,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "CUSTOMIZED_MODEL = \"\" # paste from the previous notebook"
+    "CUSTOMIZED_MODEL = CUSTOM_MODEL # CUSTOM_MODEL is defined in config.py; no need to paste from the previous notebook"
    ]
   },
   {
@@ -262,7 +262,7 @@
     "# Create guardrails configuration\n",
     "guardrail_config = nemo_client.guardrail.configs.create(\n",
     "    name=\"toolcalling\",\n",
-    "    namespace=\"default\",\n",
+    "    namespace=NMS_NAMESPACE,\n",
     "    data={\n",
     "      \"models\": [\n",
     "        { \n",
@@ -454,7 +454,7 @@
     "                }\n",
     "            ],\n",
     "            guardrails={\n",
-    "                \"config_id\": \"toolcalling\"\n",
+    "                \"config_id\": f\"{NMS_NAMESPACE}/toolcalling\"\n",
     "            },\n",
     "            temperature=0.2,\n",
     "            top_p=1\n",
diff --git a/nemo/data-flywheel/tool-calling/config.py b/nemo/data-flywheel/tool-calling/config.py
@@ -19,3 +19,6 @@
 # (Optional) Configure the base model. Must be one supported by the NeMo Customizer deployment!
 BASE_MODEL = "meta/llama-3.2-1b-instruct"
 BASE_MODEL_VERSION = "v1.0.0+A100"
+
+# (Optional) Configure the custom model name. Defining it here ensures the name can be passed to the other notebooks.
+CUSTOM_MODEL = f"{NMS_NAMESPACE}/llama-3.2-1b-xlam-run1@v1"

Original file line number	Diff line number	Diff line change
`@@ -124,7 +124,7 @@`
`124`	`124`	`"metadata": {},`
`125`	`125`	`"outputs": [],`
`126`	`126`	`"source": [`
`127`		`- "CUSTOMIZED_MODEL = \"\" # paste from the previous notebook"`
	`127`	`+ "CUSTOMIZED_MODEL = CUSTOM_MODEL # CUSTOM_MODEL is defined in config.py; no need to paste from the previous notebook"`
`128`	`128`	`]`
`129`	`129`	`},`
`130`	`130`	`{`