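Minor update: hoist `warnings` import to module level in forecasting.py, clean up and re-run the timeseries_analysis notebook, and rename ARIMA_EVALUATE_OUTPUT_COL to ARIMA_EVALUATE_OUTPUT_COLUMNS in tests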

shuoweil · shuoweil · commit 51b336bdae45 · 2025-12-12T04:03:20.000Z
diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py
@@ -17,6 +17,7 @@
 from __future__ import annotations
 
 from typing import List, Optional
+import warnings
 
 from google.cloud import bigquery
 
@@ -234,8 +235,6 @@ def _fit(
         if self.data_frequency in ["hourly", "per_minute"]:
             timestamp_col = X.columns[0]
             if "date" in X[timestamp_col].dtype.name:
-                import warnings
-
                 warnings.warn(
                     f"Converting Date column '{timestamp_col}' to datetime for "
                     f"{self.data_frequency} frequency. This is required because "
diff --git a/notebooks/ml/timeseries_analysis.ipynb b/notebooks/ml/timeseries_analysis.ipynb
@@ -18,6 +18,7 @@
    "outputs": [],
    "source": [
     "import bigframes.pandas as bpd\n",
+    "from bigframes.ml import forecasting\n",
     "bpd.options.display.repr_mode = \"anywidget\""
    ]
   },
@@ -38,15 +39,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Load the bikeshare dataset from the public BigQuery repository.\n",
     "df = bpd.read_gbq(\"bigquery-public-data.san_francisco_bikeshare.bikeshare_trips\")\n",
-    "\n",
-    "# Filter the data to focus on a specific time period and user type.\n",
     "df = df[df[\"start_date\"] >= \"2018-01-01\"]\n",
     "df = df[df[\"subscriber_type\"] == \"Subscriber\"]\n",
-    "\n",
-    "# Resample the data to an hourly frequency by counting the number of trips in each hour.\n",
-    "df[\"trip_hour\"] = df[\"start_date\"] .dt.floor(\"h\")\n",
+    "df[\"trip_hour\"] = df[\"start_date\"].dt.floor(\"h\")\n",
     "df_grouped = df[[\"trip_hour\", \"trip_id\"]].groupby(\"trip_hour\").count().reset_index()\n",
     "df_grouped = df_grouped.rename(columns={\"trip_id\": \"num_trips\"})"
    ]
@@ -80,7 +76,7 @@
      "data": {
       "text/html": [
        "✅ Completed. \n",
-       "    Query processed 58.7 MB in 16 seconds of slot time. [<a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:8904286c-644a-409d-9ba0-fe308a3382bf&page=queryresults\">Job bigframes-dev:US.8904286c-644a-409d-9ba0-fe308a3382bf details</a>]\n",
+       "    Query processed 58.7 MB in 16 seconds of slot time. [<a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:b91a9e6f-d00a-44f6-afe1-255e25945a1d&page=queryresults\">Job bigframes-dev:US.b91a9e6f-d00a-44f6-afe1-255e25945a1d details</a>]\n",
        "    "
       ],
       "text/plain": [
@@ -94,7 +90,7 @@
      "data": {
       "text/html": [
        "✅ Completed. \n",
-       "    Query processed 7.1 kB in 16 seconds of slot time. [<a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:f859016e-cf03-4581-b176-e50c559f9380&page=queryresults\">Job bigframes-dev:US.f859016e-cf03-4581-b176-e50c559f9380 details</a>]\n",
+       "    Query processed 7.1 kB in 9 seconds of slot time. [<a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:1f3bfd8b-5740-4895-be14-6a8b92a4f3b1&page=queryresults\">Job bigframes-dev:US.1f3bfd8b-5740-4895-be14-6a8b92a4f3b1 details</a>]\n",
        "    "
       ],
       "text/plain": [
@@ -135,12 +131,12 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "f53e2ae48801458ab42facd6ebe10728",
+       "model_id": "8a4d64e6cf4844018c1b8593d1d99e05",
        "version_major": 2,
        "version_minor": 1
       },
       "text/plain": [
-       "<bigframes.display.anywidget.TableWidget object at 0x7f62f2b13380>"
+       "<bigframes.display.anywidget.TableWidget object at 0x7efcefb3bb60>"
       ]
      },
      "metadata": {},
@@ -159,10 +155,6 @@
     }
    ],
    "source": [
-    "# Use the TimesFM model to forecast the last 168 hours (one week).\n",
-    "# The `timestamp_column` specifies the time index of the series.\n",
-    "# The `data_column` is the value we want to forecast.\n",
-    "# The `horizon` defines how many steps into the future to predict.\n",
     "result = df_grouped.head(2842-168).ai.forecast(\n",
     "    timestamp_column=\"trip_hour\",\n",
     "    data_column=\"num_trips\",\n",
@@ -191,7 +183,7 @@
      "data": {
       "text/html": [
        "\n",
-       "    Query processed 1.8 MB in 40 seconds of slot time. [<a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:02689ad8-e003-4911-acfc-be2e5c75652d&page=queryresults\">Job bigframes-dev:US.02689ad8-e003-4911-acfc-be2e5c75652d details</a>]\n",
+       "    Query processed 1.8 MB in 47 seconds of slot time. [<a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:36efa98e-2843-4bc9-8225-06875236ef17&page=queryresults\">Job bigframes-dev:US.36efa98e-2843-4bc9-8225-06875236ef17 details</a>]\n",
        "    "
       ],
       "text/plain": [
@@ -205,7 +197,7 @@
      "data": {
       "text/html": [
        "✅ Completed. \n",
-       "    Query processed 92.2 kB in a moment of slot time. [<a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:9e29715c-0d9e-40db-8ead-d063791e61a5&page=queryresults\">Job bigframes-dev:US.9e29715c-0d9e-40db-8ead-d063791e61a5 details</a>]\n",
+       "    Query processed 92.2 kB in a moment of slot time. [<a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:18805b62-a8e2-4c69-a5bf-97aa19df8095&page=queryresults\">Job bigframes-dev:US.18805b62-a8e2-4c69-a5bf-97aa19df8095 details</a>]\n",
        "    "
       ],
       "text/plain": [
@@ -233,7 +225,7 @@
      "data": {
       "text/html": [
        "✅ Completed. \n",
-       "    Query processed 10.8 kB in 16 seconds of slot time. [<a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:bf966f53-b73a-42af-8cb7-5d2384620e3d&page=queryresults\">Job bigframes-dev:US.bf966f53-b73a-42af-8cb7-5d2384620e3d details</a>]\n",
+       "    Query processed 10.8 kB in 11 seconds of slot time. [<a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:a2b15286-6c7f-40a7-8009-045e5e4f3dbf&page=queryresults\">Job bigframes-dev:US.a2b15286-6c7f-40a7-8009-045e5e4f3dbf details</a>]\n",
        "    "
       ],
       "text/plain": [
@@ -274,12 +266,12 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "a5360418703842cf9670f9d6ae77d8ee",
+       "model_id": "b3816017ab4440c7bcf258df4a0ceff8",
        "version_major": 2,
        "version_minor": 1
       },
       "text/plain": [
-       "<bigframes.display.anywidget.TableWidget object at 0x7f62f011fc50>"
+       "<bigframes.display.anywidget.TableWidget object at 0x7efcec227c50>"
       ]
      },
      "metadata": {},
@@ -298,27 +290,16 @@
     }
    ],
    "source": [
-    "from bigframes.ml import forecasting\n",
-    "\n",
-    "# Create and configure an ARIMAPlus model for hourly data.\n",
-    "# `auto_arima_max_order` is set to a lower value to reduce the training time.\n",
-    "# `data_frequency` is set to 'hourly' to match our aggregated data.\n",
     "model = forecasting.ARIMAPlus(\n",
     "    auto_arima_max_order=5,  # Reduce runtime for large datasets\n",
     "    data_frequency=\"hourly\",\n",
     "    horizon=168\n",
     ")\n",
-    "\n",
-    "# Prepare the training data by excluding the last week.\n",
-    "X = df_grouped.head(2842-168)[[\"trip_hour\"] ]\n",
-    "y = df_grouped.head(2842-168)[[\"num_trips\"] ]\n",
-    "\n",
-    "# Fit the model to the training data.\n",
+    "X = df_grouped.head(2842-168)[[\"trip_hour\"]]\n",
+    "y = df_grouped.head(2842-168)[[\"num_trips\"]]\n",
     "model.fit(\n",
     "    X, y\n",
     ")\n",
-    "\n",
-    "# Generate predictions for the specified horizon.\n",
     "predictions = model.predict(horizon=168, confidence_level=0.95)\n",
     "predictions"
    ]
@@ -343,7 +324,7 @@
      "data": {
       "text/html": [
        "✅ Completed. \n",
-       "    Query processed 31.7 MB in 9 seconds of slot time.\n",
+       "    Query processed 31.7 MB in 10 seconds of slot time.\n",
        "    "
       ],
       "text/plain": [
@@ -357,7 +338,7 @@
      "data": {
       "text/html": [
        "✅ Completed. \n",
-       "    Query processed 58.8 MB in 9 seconds of slot time.\n",
+       "    Query processed 58.8 MB in 7 seconds of slot time.\n",
        "    "
       ],
       "text/plain": [
@@ -389,30 +370,23 @@
     }
    ],
    "source": [
-    "# Prepare the TimesFM forecast data.\n",
-    "timesfm_result = result.sort_values(\"forecast_timestamp\")[ [ \"forecast_timestamp\", \"forecast_value\" ] ]\n",
+    "timesfm_result = result.sort_values(\"forecast_timestamp\")[[\"forecast_timestamp\", \"forecast_value\"]]\n",
     "timesfm_result = timesfm_result.rename(columns={\n",
     "    \"forecast_timestamp\": \"trip_hour\",\n",
     "    \"forecast_value\": \"timesfm_forecast\"\n",
     "})\n",
-    "\n",
-    "# Prepare the ARIMAPlus forecast data.\n",
-    "arimaplus_result = predictions.sort_values(\"forecast_timestamp\")[ [ \"forecast_timestamp\", \"forecast_value\" ] ]\n",
+    "arimaplus_result = predictions.sort_values(\"forecast_timestamp\")[[\"forecast_timestamp\", \"forecast_value\"]]\n",
     "arimaplus_result = arimaplus_result.rename(columns={\n",
     "    \"forecast_timestamp\": \"trip_hour\",\n",
     "    \"forecast_value\": \"arimaplus_forecast\"\n",
     "})\n",
-    "\n",
-    "# Merge the forecasts with the original data.\n",
     "df_all = df_grouped.merge(timesfm_result, on=\"trip_hour\", how=\"left\")\n",
     "df_all = df_all.merge(arimaplus_result, on=\"trip_hour\", how=\"left\")\n",
-    "\n",
-    "# Plot the last 4 weeks of data for comparison.\n",
-    "df_all.tail(672).plot.line(  \n",
-    "    x=\"trip_hour\",   \n",
-    "    y=[\"num_trips\", \"timesfm_forecast\", \"arimaplus_forecast\"],   \n",
-    "    rot=45,  \n",
-    "    title=\"Trip Forecasts Comparison\"  \n",
+    "df_all.tail(672).plot.line(\n",
+    "    x=\"trip_hour\",\n",
+    "    y=[\"num_trips\", \"timesfm_forecast\", \"arimaplus_forecast\"],\n",
+    "    rot=45,\n",
+    "    title=\"Trip Forecasts Comparison\"\n",
     ")"
    ]
   },
@@ -436,20 +410,20 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: TimeTravelCacheWarning: Reading cached table from 2025-12-12 03:22:23.615364+00:00 to avoid\n",
+      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: TimeTravelCacheWarning: Reading cached table from 2025-12-12 03:47:11.144938+00:00 to avoid\n",
       "incompatibilies with previous reads of this table. To read the latest\n",
       "version, set `use_cache=False` or close the current session with\n",
       "Session.close() or bigframes.pandas.close_session().\n",
       "  return method(*args, **kwargs)\n",
-      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/ml/forecasting.py:239: UserWarning: Converting Date column 'date' to datetime for hourly frequency. This is required because BigQuery ML doesn't support Date type with hourly frequency.\n",
+      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/ml/forecasting.py:238: UserWarning: Converting Date column 'date' to datetime for hourly frequency. This is required because BigQuery ML doesn't support Date type with hourly frequency.\n",
       "  warnings.warn(\n"
      ]
     },
     {
      "data": {
       "text/html": [
        "\n",
-       "    Query processed 39.4 MB in 2 hours of slot time. [<a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:c6f5d199-d64a-495f-b9f7-d6eaef4eb4b6&page=queryresults\">Job bigframes-dev:US.c6f5d199-d64a-495f-b9f7-d6eaef4eb4b6 details</a>]\n",
+       "    Query processed 39.4 MB in 2 hours of slot time. [<a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:16d735c6-c885-447f-b513-5249ee8cb48a&page=queryresults\">Job bigframes-dev:US.16d735c6-c885-447f-b513-5249ee8cb48a details</a>]\n",
        "    "
       ],
       "text/plain": [
@@ -463,7 +437,7 @@
      "data": {
       "text/html": [
        "✅ Completed. \n",
-       "    Query processed 32.0 MB in 5 seconds of slot time. [<a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:83063588-a945-4405-812a-87305cf640fe&page=queryresults\">Job bigframes-dev:US.83063588-a945-4405-812a-87305cf640fe details</a>]\n",
+       "    Query processed 32.0 MB in 3 seconds of slot time. [<a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:c4cf8019-bdf6-461a-8753-e7314b837c29&page=queryresults\">Job bigframes-dev:US.c4cf8019-bdf6-461a-8753-e7314b837c29 details</a>]\n",
        "    "
       ],
       "text/plain": [
@@ -491,7 +465,7 @@
      "data": {
       "text/html": [
        "✅ Completed. \n",
-       "    Query processed 11.5 kB in 11 seconds of slot time. [<a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:2c711378-2060-46a4-90b3-183a855463d4&page=queryresults\">Job bigframes-dev:US.2c711378-2060-46a4-90b3-183a855463d4 details</a>]\n",
+       "    Query processed 11.5 kB in 8 seconds of slot time. [<a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:13663bcd-b3e2-471c-b7e7-50c260c4cfdd&page=queryresults\">Job bigframes-dev:US.13663bcd-b3e2-471c-b7e7-50c260c4cfdd details</a>]\n",
        "    "
       ],
       "text/plain": [
@@ -532,12 +506,12 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "fc8931fb4cd0464ea1ebd790eda7d909",
+       "model_id": "5cd08895656f45f5bfe39fd6bb26855f",
        "version_major": 2,
        "version_minor": 1
       },
       "text/plain": [
-       "<bigframes.display.anywidget.TableWidget object at 0x7f62f0068550>"
+       "<bigframes.display.anywidget.TableWidget object at 0x7efcec160550>"
       ]
      },
      "metadata": {},
@@ -556,28 +530,21 @@
     }
    ],
    "source": [
-    "# Filter for specific stations to create a dataset with multiple distinct time series.\n",
     "df_multi = bpd.read_gbq(\"bigquery-public-data.san_francisco_bikeshare.bikeshare_trips\")\n",
-    "df_multi = df_multi[df_multi[\"start_station_name\"] .str.contains(\"Market|Powell|Embarcadero\")]\n",
-    "\n",
-    "# Group the data by station and date to create a time series for each station.\n",
+    "df_multi = df_multi[df_multi[\"start_station_name\"].str.contains(\"Market|Powell|Embarcadero\")]\n",
     "features = bpd.DataFrame({\n",
     "    \"start_station_name\": df_multi[\"start_station_name\"],\n",
     "    \"num_trips\": df_multi[\"start_date\"],\n",
-    "    \"date\": df_multi[\"start_date\"] .dt.date,\n",
+    "    \"date\": df_multi[\"start_date\"].dt.date,\n",
     "})\n",
     "num_trips = features.groupby(\n",
-    "    [ \"start_station_name\", \"date\" ], as_index=False\n",
-    " ).count()\n",
-    "\n",
-    "# Fit the model, using the 'start_station_name' column to identify each individual time series.\n",
-    "model.fit (\n",
+    "    [\"start_station_name\", \"date\"], as_index=False\n",
+    ").count()\n",
+    "model.fit(\n",
     "    num_trips[[\"date\"]],\n",
     "    num_trips[[\"num_trips\"]],\n",
-    "    id_col=num_trips[[\"start_station_name\"] ]\n",
+    "    id_col=num_trips[[\"start_station_name\"]]\n",
     ")\n",
-    "\n",
-    "# Predict the future values for each time series.\n",
     "predictions_multi = model.predict()\n",
     "predictions_multi"
    ]
diff --git a/tests/system/large/ml/test_forecasting.py b/tests/system/large/ml/test_forecasting.py
@@ -17,7 +17,7 @@
 from bigframes.ml import forecasting
 from bigframes.testing import utils
 
-ARIMA_EVALUATE_OUTPUT_COL = [
+ARIMA_EVALUATE_OUTPUT_COLUMNS = [
     "non_seasonal_p",
     "non_seasonal_d",
     "non_seasonal_q",
@@ -106,9 +106,9 @@ def test_arima_plus_model_fit_summary(
     curr_model = arima_model_w_id if id_col_name else arima_model
     result = curr_model.summary().to_pandas()
     expected_columns = (
-        [id_col_name] + ARIMA_EVALUATE_OUTPUT_COL
+        [id_col_name] + ARIMA_EVALUATE_OUTPUT_COLUMNS
         if id_col_name
-        else ARIMA_EVALUATE_OUTPUT_COL
+        else ARIMA_EVALUATE_OUTPUT_COLUMNS
     )
     utils.check_pandas_df_schema_and_index(
         result, columns=expected_columns, index=2 if id_col_name else 1