fix: Auto-convert Date timestamps to datetime for hourly/per_minute ARIMAPlus fits to resolve BadRequest in timeseries_analysis.ipynb

shuoweil · shuoweil · commit 2794d38f99f5 · 2025-12-05T04:26:56.000Z
diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py
@@ -230,6 +230,20 @@ def _fit(
         """
         X, y = utils.batch_convert_to_dataframe(X, y)
 
+        # Auto-convert Date to datetime for hourly/per_minute frequency
+        if self.data_frequency in ["hourly", "per_minute"]:
+            timestamp_col = X.columns[0]
+            if "date" in X[timestamp_col].dtype.name:
+                import warnings
+
+                warnings.warn(
+                    f"Converting Date column '{timestamp_col}' to datetime for "
+                    f"{self.data_frequency} frequency. This is required because "
+                    f"BigQuery ML doesn't support Date type with hourly frequency."
+                )
+                X = X.copy()
+                X[timestamp_col] = bpd.to_datetime(X[timestamp_col])
+
         if X.columns.size != 1:
             raise ValueError("Time series timestamp input X contain at least 1 column.")
         if y.columns.size != 1:
diff --git a/notebooks/ml/timeseries_analysis.ipynb b/notebooks/ml/timeseries_analysis.ipynb
@@ -0,0 +1,198 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "cf1403ce",
+   "metadata": {},
+   "source": [
+    "# Time Series Forecasting with BigFrames\n",
+    "\n",
+    "This notebook demonstrates time series forecasting using BigFrames with TimesFM and ARIMAPlus models on San Francisco bikeshare data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c0b2db75",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import bigframes.pandas as bpd\n",
+    "bpd.options.display.repr_mode = \"anywidget\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "83928f4d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load bikeshare data, filtering for subscriber trips from 2018 onwards.\n",
+    "df = bpd.read_gbq(\"bigquery-public-data.san_francisco_bikeshare.bikeshare_trips\")\n",
+    "df = df[df[\"start_date\"] >= \"2018-01-01\"]\n",
+    "df = df[df[\"subscriber_type\"] == \"Subscriber\"]\n",
+    "\n",
+    "# Aggregate trips by hour.\n",
+    "df[\"trip_hour\"] = df[\"start_date\"] .dt.floor(\"h\")\n",
+    "df_grouped = df[[\"trip_hour\", \"trip_id\"]].groupby(\"trip_hour\").count().reset_index()\n",
+    "df_grouped = df_grouped.rename(columns={\"trip_id\": \"num_trips\"})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c43b7e65",
+   "metadata": {},
+   "source": [
+    "## Forecasting with TimesFM\n",
+    "\n",
+    "Use TimesFM to forecast the number of bikeshare trips for the last week of the dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1096e154",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Forecast the last 168 hours (one week).\n",
+    "result = df_grouped.head(2842-168).ai.forecast(\n",
+    "    timestamp_column=\"trip_hour\",\n",
+    "    data_column=\"num_trips\",\n",
+    "    horizon=168\n",
+    ")\n",
+    "result"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "90e80a82",
+   "metadata": {},
+   "source": [
+    "## Forecasting with ARIMAPlus\n",
+    "\n",
+    "Forecast the same period using the ARIMAPlus model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f41e1cf0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from bigframes.ml import forecasting\n",
+    "\n",
+    "# Create and configure an ARIMAPlus model for hourly data.\n",
+    "model = forecasting.ARIMAPlus(\n",
+    "    auto_arima_max_order=5,  # Reduce runtime for large datasets\n",
+    "    data_frequency=\"hourly\",\n",
+    "    horizon=168\n",
+    ")\n",
+    "\n",
+    "# Use the same training data as the TimesFM model.\n",
+    "X = df_grouped.head(2842-168)[[\"trip_hour\"]]\n",
+    "y = df_grouped.head(2842-168)[[\"num_trips\"]]\n",
+    "\n",
+    "model.fit(X, y)\n",
+    "predictions = model.predict(horizon=168, confidence_level=0.95)\n",
+    "predictions\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "015804c3",
+   "metadata": {},
+   "source": [
+    "## Multiple Time Series Forecasting\n",
+    "\n",
+    "Use ARIMAPlus to forecast multiple time series simultaneously. The `id_col` parameter differentiates each series."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6dbe6c48",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Filter for specific stations to create distinct time series.\n",
+    "df_multi = bpd.read_gbq(\"bigquery-public-data.san_francisco_bikeshare.bikeshare_trips\")\n",
+    "df_multi = df_multi[df_multi[\"start_station_name\"] .str.contains(\"Market|Powell|Embarcadero\")]\n",
+    "\n",
+    "# Group data by station and date.\n",
+    "features = bpd.DataFrame({\n",
+    "    \"start_station_name\": df_multi[\"start_station_name\"],\n",
+    "    \"num_trips\": df_multi[\"start_date\"],\n",
+    "    \"date\": df_multi[\"start_date\"] .dt.date,\n",
+    "})\n",
+    "num_trips = features.groupby(\n",
+    "    [\"start_station_name\", \"date\"], as_index=False\n",
+    " ).count()\n",
+    "\n",
+    "# Reuse the hourly model: fit() auto-converts the Date column to datetime (with a warning). Each series is identified by 'start_station_name'.\n",
+    "model.fit(\n",
+    "    num_trips[[\"date\"]],\n",
+    "    num_trips[[\"num_trips\"]],\n",
+    "    id_col=num_trips[[\"start_station_name\"]]\n",
+    ")\n",
+    "model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4ed68c3c",
+   "metadata": {},
+   "source": [
+    "## Visualize Forecasting Results\n",
+    "\n",
+    "Plot the TimesFM forecast results against the actual data to visually assess model performance."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e7a29e2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Prepare forecast data for plotting.\n",
+    "result = result.sort_values(\"forecast_timestamp\")\n",
+    "result = result[[\"forecast_timestamp\", \"forecast_value\"]]\n",
+    "result = result.rename(columns={\n",
+    "    \"forecast_timestamp\": \"trip_hour\",\n",
+    "    \"forecast_value\": \"num_trips_forecast\"\n",
+    "})\n",
+    "\n",
+    "# Combine actual and forecasted data for the last 4 weeks.\n",
+    "df_all = bpd.concat([df_grouped, result])\n",
+    "df_all = df_all.tail(672)\n",
+    "\n",
+    "# Plot actual vs. forecasted trips.\n",
+    "df_all.plot.line()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/tests/system/large/ml/test_forecasting.py b/tests/system/large/ml/test_forecasting.py
@@ -190,3 +190,24 @@ def test_arima_plus_model_fit_params(
     assert reloaded_model.min_time_series_length == 10
     assert reloaded_model.trend_smoothing_window_size == 5
     assert reloaded_model.decompose_time_series is False
+
+
+def test_arima_plus_model_fit_date_conversion(time_series_df_default_index):
+    model = forecasting.ARIMAPlus(data_frequency="hourly")
+
+    # Arrange: make parsed_date a Date column so fit() must auto-convert it.
+    df = time_series_df_default_index.copy()
+    df["parsed_date"] = df["parsed_date"].dt.date
+
+    X_train = df[["parsed_date"]]
+    y_train = df[["total_visits"]]
+
+    with pytest.warns(
+        UserWarning,
+        match="Converting Date column 'parsed_date' to datetime for hourly frequency.",
+    ):
+        # Act: fit() on the Date column must emit the conversion warning.
+        model.fit(X_train, y_train)
+
+    # Assert: the fit went through and set the model's _bqml_model attribute.
+    assert model._bqml_model is not None