|
594 | 594 | "\n", |
595 | 595 | "customization = client_with_wandb.customization.jobs.create(\n", |
596 | 596 | " name=\"llama-3.2-1b-xlam-ft\",\n", |
597 | | - " output_model=f\"{NMS_NAMESPACE}/llama-3.2-1b-xlam-run1\",\n", |
| 597 | + " output_model=CUSTOM_MODEL,\n", |
598 | 598 | " config=f\"{BASE_MODEL}@{BASE_MODEL_VERSION}\",\n", |
599 | 599 | " dataset={\"name\": DATASET_NAME, \"namespace\": NMS_NAMESPACE},\n", |
600 | 600 | " hyperparameters={\n", |
|
952 | 952 | "print(\"Job Status:\", json.dumps(job_status.model_dump(), indent=2, default=str))" |
953 | 953 | ] |
954 | 954 | }, |
| 955 | + { |
| 956 | + "cell_type": "code", |
| 957 | + "execution_count": null, |
| 958 | + "id": "e84514e1", |
| 959 | + "metadata": {}, |
| 960 | + "outputs": [], |
| 961 | + "source": [ |
| 962 | + "# Define a helper that polls the customization job until it finishes\n", |
| 963 | + "\n", |
| 964 | + "from time import sleep, time\n", |
| 965 | + "\n", |
| 966 | + "def wait_job(nemo_client, job_id: str, polling_interval: int = 10, timeout: int = 6000):\n", |
| 967 | + " \"\"\"Poll a customization job via the SDK until it leaves the pending/created/running states, then return it.\"\"\"\n", |
| 968 | + " start_time = time()\n", |
| 969 | + " job = nemo_client.customization.jobs.retrieve(job_id=job_id)\n", |
| 970 | + " status = job.status\n", |
| 971 | + "\n", |
| 972 | + " while (status in [\"pending\", \"created\", \"running\"]):\n", |
| 973 | + " # Check for timeout\n", |
| 974 | + " if time() - start_time > timeout:\n", |
| 975 | + " raise RuntimeError(f\"Took more than {timeout} seconds.\")\n", |
| 976 | + "\n", |
| 977 | + " # Sleep before polling again\n", |
| 978 | + " sleep(polling_interval)\n", |
| 979 | + "\n", |
| 980 | + " # Fetch updated status and progress\n", |
| 981 | + " job = nemo_client.customization.jobs.retrieve(job_id=job_id)\n", |
| 982 | + " status = job.status\n", |
| 983 | + " progress = 0.0\n", |
| 984 | + " if status == \"running\" and job.status_details:\n", |
| 985 | + " progress = job.status_details.percentage_done or 0.0\n", |
| 986 | + " elif status == \"completed\":\n", |
| 987 | + " progress = 100\n", |
| 988 | + "\n", |
| 989 | + " print(f\"Job status: {status} after {time() - start_time:.2f} seconds. Progress: {progress}%\")\n", |
| 990 | + "\n", |
| 991 | + "\n", |
| 992 | + " return job\n", |
| 993 | + "\n", |
| 994 | + "job = wait_job(nemo_client, JOB_ID, polling_interval=5, timeout=2400)\n", |
| 995 | + "\n", |
| 996 | + "# Wait 2 more minutes: the job can report completion before the fine-tuned model is ready in NIM.\n", |
| 997 | + "sleep(120)" |
| 998 | + ] |
| 999 | + }, |
955 | 1000 | { |
956 | 1001 | "cell_type": "markdown", |
957 | 1002 | "id": "42b721be-8ca0-4e8f-99a7-5eb12ea1b47f", |
958 | 1003 | "metadata": {}, |
959 | 1004 | "source": [ |
960 | | - "**IMPORTANT:** Monitor the job status. Ensure training is completed before proceeding by observing the `percentage_done` key in the response frame." |
| 1005 | + "**IMPORTANT:** At this point, the customization job should be complete. If `wait_job` raised a timeout error or `job.status` is not `\"completed\"`, check the logs (`job.status_details.status_logs`) before proceeding." |
961 | 1006 | ] |
962 | 1007 | }, |
963 | 1008 | { |
|
0 commit comments