@@ -18,10 +18,14 @@ def test_kmeans_sample(project_id: str, random_model_id_eu: str) -> None:
1818 your_model_id = random_model_id_eu
1919 # [START bigquery_dataframes_bqml_kmeans]
2020 import datetime
21+ import typing
2122
2223 import pandas as pd
24+ from shapely .geometry import Point
2325
2426 import bigframes
27+ import bigframes .bigquery as bbq
28+ import bigframes .geopandas
2529 import bigframes .pandas as bpd
2630
2731 bigframes .options .bigquery .project = your_gcp_project_id
@@ -41,21 +45,21 @@ def test_kmeans_sample(project_id: str, random_model_id_eu: str) -> None:
4145 }
4246 )
4347
44- s = bpd .read_gbq (
45- # Use ST_GEOPOINT and ST_DISTANCE to analyze geographical
46- # data. These functions determine spatial relationships between
47- # geographical features.
48- """
49- SELECT
50- id,
51- ST_DISTANCE(
52- ST_GEOGPOINT(s.longitude, s.latitude),
53- ST_GEOGPOINT(-0.1, 51.5)
54- ) / 1000 AS distance_from_city_center
55- FROM
56- `bigquery-public-data.london_bicycles.cycle_stations` s
57- """
48+ # Use GeoSeries.from_xy (bigframes.geopandas) and st_distance
49+ # (bigframes.bigquery) to analyze geographical data. These functions
50+ # determine spatial relationships between geographical features.
51+
52+ cycle_stations = bpd .read_gbq ("bigquery-public-data.london_bicycles.cycle_stations" )
53+ s = bpd .DataFrame (
54+ {
55+ "id" : cycle_stations ["id" ],
56+ "xy" : bigframes .geopandas .GeoSeries .from_xy (
57+ cycle_stations ["longitude" ], cycle_stations ["latitude" ]
58+ ),
59+ }
5860 )
61+ s_distance = bbq .st_distance (s ["xy" ], Point (- 0.1 , 51.5 ), use_spheroid = False ) / 1000
62+ s = bpd .DataFrame ({"id" : s ["id" ], "distance_from_city_center" : s_distance })
5963
6064 # Define Python datetime objects in the UTC timezone for range comparison,
6165 # because BigQuery stores timestamp data in the UTC timezone.
@@ -91,8 +95,11 @@ def test_kmeans_sample(project_id: str, random_model_id_eu: str) -> None:
9195
9296 # Engineer features to cluster the stations. For each station, find the
9397 # average trip duration, number of trips, and distance from city center.
94- stationstats = merged_df .groupby (["station_name" , "isweekday" ]).agg (
95- {"duration" : ["mean" , "count" ], "distance_from_city_center" : "max" }
98+ stationstats = typing .cast (
99+ bpd .DataFrame ,
100+ merged_df .groupby (["station_name" , "isweekday" ]).agg (
101+ {"duration" : ["mean" , "count" ], "distance_from_city_center" : "max" }
102+ ),
96103 )
97104 stationstats .columns = pd .Index (
98105 ["duration" , "num_trips" , "distance_from_city_center" ]
0 commit comments