Skip to content

Commit 2794d38

Browse files
committed
fix: Correct typo in timeseries_analysis.ipynb to resolve BadRequest
1 parent 7e959b9 commit 2794d38

File tree

3 files changed

+233
-0
lines changed

3 files changed

+233
-0
lines changed

bigframes/ml/forecasting.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,20 @@ def _fit(
230230
"""
231231
X, y = utils.batch_convert_to_dataframe(X, y)
232232

233+
# Auto-convert Date to datetime for hourly/per_minute frequency
234+
if self.data_frequency in ["hourly", "per_minute"]:
235+
timestamp_col = X.columns[0]
236+
if "date" in X[timestamp_col].dtype.name:
237+
import warnings
238+
239+
warnings.warn(
240+
f"Converting Date column '{timestamp_col}' to datetime for "
241+
f"{self.data_frequency} frequency. This is required because "
242+
f"BigQuery ML doesn't support Date type with hourly frequency."
243+
)
244+
X = X.copy()
245+
X[timestamp_col] = bpd.to_datetime(X[timestamp_col])
246+
233247
if X.columns.size != 1:
234248
raise ValueError("Time series timestamp input X contain at least 1 column.")
235249
if y.columns.size != 1:
Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "cf1403ce",
6+
"metadata": {},
7+
"source": [
8+
"# Time Series Forecasting with BigFrames\n",
9+
"\n",
10+
"This notebook demonstrates time series forecasting using BigFrames with TimesFM and ARIMAPlus models on San Francisco bikeshare data."
11+
]
12+
},
13+
{
14+
"cell_type": "code",
15+
"execution_count": null,
16+
"id": "c0b2db75",
17+
"metadata": {},
18+
"outputs": [],
19+
"source": [
20+
"import bigframes.pandas as bpd\n",
21+
"bpd.options.display.repr_mode = \"anywidget\""
22+
]
23+
},
24+
{
25+
"cell_type": "code",
26+
"execution_count": null,
27+
"id": "83928f4d",
28+
"metadata": {},
29+
"outputs": [],
30+
"source": [
31+
"# Load bikeshare data, filtering for subscriber trips from 2018 onwards.\n",
32+
"df = bpd.read_gbq(\"bigquery-public-data.san_francisco_bikeshare.bikeshare_trips\")\n",
33+
"df = df[df[\"start_date\"] >= \"2018-01-01\"]\n",
34+
"df = df[df[\"subscriber_type\"] == \"Subscriber\"]\n",
35+
"\n",
36+
"# Aggregate trips by hour.\n",
37+
"df[\"trip_hour\"] = df[\"start_date\"] .dt.floor(\"h\")\n",
38+
"df_grouped = df[[\"trip_hour\", \"trip_id\"]].groupby(\"trip_hour\").count().reset_index()\n",
39+
"df_grouped = df_grouped.rename(columns={\"trip_id\": \"num_trips\"})"
40+
]
41+
},
42+
{
43+
"cell_type": "markdown",
44+
"id": "c43b7e65",
45+
"metadata": {},
46+
"source": [
47+
"## Forecasting with TimesFM\n",
48+
"\n",
49+
"Use TimesFM to forecast the number of bikeshare trips for the last week of the dataset."
50+
]
51+
},
52+
{
53+
"cell_type": "code",
54+
"execution_count": null,
55+
"id": "1096e154",
56+
"metadata": {},
57+
"outputs": [],
58+
"source": [
59+
"# Forecast the last 168 hours (one week).\n",
60+
"result = df_grouped.head(2842-168).ai.forecast(\n",
61+
" timestamp_column=\"trip_hour\",\n",
62+
" data_column=\"num_trips\",\n",
63+
" horizon=168\n",
64+
")\n",
65+
"result"
66+
]
67+
},
68+
{
69+
"cell_type": "markdown",
70+
"id": "90e80a82",
71+
"metadata": {},
72+
"source": [
73+
"## Forecasting with ARIMAPlus\n",
74+
"\n",
75+
"Forecast the same period using the ARIMAPlus model."
76+
]
77+
},
78+
{
79+
"cell_type": "code",
80+
"execution_count": null,
81+
"id": "f41e1cf0",
82+
"metadata": {},
83+
"outputs": [],
84+
"source": [
85+
"from bigframes.ml import forecasting\n",
86+
"\n",
87+
"# Create and configure an ARIMAPlus model for hourly data.\n",
88+
"model = forecasting.ARIMAPlus(\n",
89+
" auto_arima_max_order=5, # Reduce runtime for large datasets\n",
90+
" data_frequency=\"hourly\",\n",
91+
" horizon=168\n",
92+
")\n",
93+
"\n",
94+
"# Use the same training data as the TimesFM model.\n",
95+
"X = df_grouped.head(2842-168)[[\"trip_hour\"]]\n",
96+
"y = df_grouped.head(2842-168)[[\"num_trips\"]]\n",
97+
"\n",
98+
"model.fit(X, y)\n",
99+
"predictions = model.predict(horizon=168, confidence_level=0.95)\n",
100+
"predictions\n"
101+
]
102+
},
103+
{
104+
"cell_type": "markdown",
105+
"id": "015804c3",
106+
"metadata": {},
107+
"source": [
108+
"## Multiple Time Series Forecasting\n",
109+
"\n",
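110+
"Use ARIMAPlus to forecast multiple time series simultaneously. The `id_col` parameter differentiates each series. The trips here are aggregated per day, while the model created above uses `data_frequency=\"hourly\"`, so `fit` converts the `date` column to datetime and emits a warning."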
111+
]
112+
},
113+
{
114+
"cell_type": "code",
115+
"execution_count": null,
116+
"id": "6dbe6c48",
117+
"metadata": {},
118+
"outputs": [],
119+
"source": [
120+
"# Filter for specific stations to create distinct time series.\n",
121+
"df_multi = bpd.read_gbq(\"bigquery-public-data.san_francisco_bikeshare.bikeshare_trips\")\n",
122+
"df_multi = df_multi[df_multi[\"start_station_name\"] .str.contains(\"Market|Powell|Embarcadero\")]\n",
123+
"\n",
124+
"# Group data by station and date.\n",
125+
"features = bpd.DataFrame({\n",
126+
" \"start_station_name\": df_multi[\"start_station_name\"],\n",
127+
" \"num_trips\": df_multi[\"start_date\"],\n",
128+
" \"date\": df_multi[\"start_date\"] .dt.date,\n",
129+
"})\n",
130+
"num_trips = features.groupby(\n",
131+
" [\"start_station_name\", \"date\"], as_index=False\n",
132+
" ).count()\n",
133+
"\n",
134+
"# Fit the model, identifying each series by 'start_station_name'.\n",
135+
"model.fit(\n",
136+
" num_trips[[\"date\"]],\n",
137+
" num_trips[[\"num_trips\"]],\n",
138+
" id_col=num_trips[[\"start_station_name\"]]\n",
139+
")\n",
140+
"model"
141+
]
142+
},
143+
{
144+
"cell_type": "markdown",
145+
"id": "4ed68c3c",
146+
"metadata": {},
147+
"source": [
148+
"## Visualize Forecasting Results\n",
149+
"\n",
150+
"Plot the TimesFM forecast results against the actual data to visually assess model performance."
151+
]
152+
},
153+
{
154+
"cell_type": "code",
155+
"execution_count": null,
156+
"id": "0e7a29e2",
157+
"metadata": {},
158+
"outputs": [],
159+
"source": [
160+
"# Prepare forecast data for plotting.\n",
161+
"result = result.sort_values(\"forecast_timestamp\")\n",
162+
"result = result[[\"forecast_timestamp\", \"forecast_value\"]]\n",
163+
"result = result.rename(columns={\n",
164+
" \"forecast_timestamp\": \"trip_hour\",\n",
165+
" \"forecast_value\": \"num_trips_forecast\"\n",
166+
"})\n",
167+
"\n",
168+
"# Combine actual and forecasted data for the last 4 weeks.\n",
169+
"df_all = bpd.concat([df_grouped, result])\n",
170+
"df_all = df_all.tail(672)\n",
171+
"\n",
172+
"# Plot actual vs. forecasted trips.\n",
173+
"df_all.plot.line()"
174+
]
175+
}
176+
],
177+
"metadata": {
178+
"kernelspec": {
179+
"display_name": "venv",
180+
"language": "python",
181+
"name": "python3"
182+
},
183+
"language_info": {
184+
"codemirror_mode": {
185+
"name": "ipython",
186+
"version": 3
187+
},
188+
"file_extension": ".py",
189+
"mimetype": "text/x-python",
190+
"name": "python",
191+
"nbconvert_exporter": "python",
192+
"pygments_lexer": "ipython3",
193+
"version": "3.11.10"
194+
}
195+
},
196+
"nbformat": 4,
197+
"nbformat_minor": 5
198+
}

tests/system/large/ml/test_forecasting.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,3 +190,24 @@ def test_arima_plus_model_fit_params(
190190
assert reloaded_model.min_time_series_length == 10
191191
assert reloaded_model.trend_smoothing_window_size == 5
192192
assert reloaded_model.decompose_time_series is False
193+
194+
195+
def test_arima_plus_model_fit_date_conversion(time_series_df_default_index):
    """Fitting an hourly ARIMAPlus model on a Date column warns and still fits.

    The timestamp column is reduced to a date before fitting so that ``fit``
    has to convert it back to datetime. The test checks that the conversion
    warning is raised and that a BQML model is attached afterwards.
    """
    model = forecasting.ARIMAPlus(data_frequency="hourly")

    # Arrange: create a dataframe with a date column to test auto-conversion.
    df = time_series_df_default_index.copy()
    df["parsed_date"] = df["parsed_date"].dt.date

    X_train = df[["parsed_date"]]
    y_train = df[["total_visits"]]

    with pytest.warns(
        UserWarning,
        # `match` is a regular expression, so the trailing period is escaped
        # to match the literal "." that ends this sentence in the warning.
        match=r"Converting Date column 'parsed_date' to datetime for hourly frequency\.",
    ):
        # Act
        model.fit(X_train, y_train)

    # Assert
    assert model._bqml_model is not None

0 commit comments

Comments
 (0)