1414
1515
1616def test_bqml_getting_started (random_model_id ):
17- your_model_id = random_model_id
17+ your_model_id = random_model_id # for example: bqml_tutorial.sample_model
1818
1919 # [START bigquery_dataframes_bqml_getting_started_tutorial]
2020 from bigframes .ml .linear_model import LogisticRegression
@@ -26,17 +26,12 @@ def test_bqml_getting_started(random_model_id):
2626 # https://github.com/googleapis/python-bigquery-dataframes/issues/169
2727 # for updates to `read_gbq` to support wildcard tables.
2828
29- df = bpd .read_gbq (
30- """
31- -- Since the order of rows isn't useful for the model training,
32- -- generate a random ID to use as the index for the DataFrame.
33- SELECT GENERATE_UUID() AS rowindex, *
34- FROM
35- `bigquery-public-data.google_analytics_sample.ga_sessions_*`
36- WHERE
37- _TABLE_SUFFIX BETWEEN '20160801' AND '20170630'
38- """ ,
39- index_col = "rowindex" ,
29+ df = bpd .read_gbq_table (
30+ "bigquery-public-data.google_analytics_sample.ga_sessions_*" ,
31+ filters = [
32+ ("_table_suffix" , ">=" , "20160801" ),
33+ ("_table_suffix" , "<=" , "20170630" ),
34+ ],
4035 )
4136
4237 # Extract the total number of transactions within
@@ -53,14 +48,14 @@ def test_bqml_getting_started(random_model_id):
5348 # ecommerce transactions within the Google Analytics session.
5449 # If the number of transactions is NULL, the value in the label
5550 # column is set to 0. Otherwise, it is set to 1.
56- label = transactions .notnull ().map ({True : 1 , False : 0 })
51+ label = transactions .notnull ().map ({True : 1 , False : 0 }). rename ( "label" )
5752
5853 # Extract the operating system of the visitor's device.
59- operatingSystem = df ["device" ].struct .field ("operatingSystem" )
60- operatingSystem = operatingSystem .fillna ("" )
54+ operating_system = df ["device" ].struct .field ("operatingSystem" )
55+ operating_system = operating_system .fillna ("" )
6156
6257 # Extract whether the visitor's device is a mobile device.
63- isMobile = df ["device" ].struct .field ("isMobile" )
58+ is_mobile = df ["device" ].struct .field ("isMobile" )
6459
6560 # Extract the country from which the sessions originated, based on the IP address.
6661 country = df ["geoNetwork" ].struct .field ("country" ).fillna ("" )
@@ -72,8 +67,8 @@ def test_bqml_getting_started(random_model_id):
7267 # to use as training data.
7368 features = bpd .DataFrame (
7469 {
75- "os" : operatingSystem ,
76- "is_mobile" : isMobile ,
70+ "os" : operating_system ,
71+ "is_mobile" : is_mobile ,
7772 "country" : country ,
7873 "pageviews" : pageviews ,
7974 }
@@ -95,39 +90,36 @@ def test_bqml_getting_started(random_model_id):
9590 # [START bigquery_dataframes_bqml_getting_started_tutorial_evaluate]
9691 import bigframes .pandas as bpd
9792
98- # Select model you'll use for training . `read_gbq_model` loads model data from a
93+ # Select model you'll use for evaluating. `read_gbq_model` loads model data from
9994 # BigQuery, but you could also use the `model` object from the previous steps.
10095 model = bpd .read_gbq_model (
10196 your_model_id , # For example: "bqml_tutorial.sample_model",
10297 )
10398
104- # The WHERE clause — _TABLE_SUFFIX BETWEEN '20170701' AND '20170801' —
105- # limits the number of tables scanned by the query. The date range scanned is
106- # July 1, 2017 to August 1, 2017. This is the data you're using to evaluate the predictive performance
107- # of the model. It was collected in the month immediately following the time
108- # period spanned by the training data.
109-
110- df = bpd .read_gbq (
111- """
112- SELECT GENERATE_UUID() AS rowindex, *
113- FROM
114- `bigquery-public-data.google_analytics_sample.ga_sessions_*`
115- WHERE
116- _TABLE_SUFFIX BETWEEN '20170701' AND '20170801'
117- """ ,
118- index_col = "rowindex" ,
99+ # The filters parameter limits the number of tables scanned by the query.
100+ # The date range scanned is July 1, 2017 to August 1, 2017. This is the
101+ # data you're using to evaluate the predictive performance of the model.
102+ # It was collected in the month immediately following the time period
103+ # spanned by the training data.
104+ df = bpd .read_gbq_table (
105+ "bigquery-public-data.google_analytics_sample.ga_sessions_*" ,
106+ filters = [
107+ ("_table_suffix" , ">=" , "20170701" ),
108+ ("_table_suffix" , "<=" , "20170801" ),
109+ ],
119110 )
111+
120112 transactions = df ["totals" ].struct .field ("transactions" )
121- label = transactions .notnull ().map ({True : 1 , False : 0 })
122- operatingSystem = df ["device" ].struct .field ("operatingSystem" )
123- operatingSystem = operatingSystem .fillna ("" )
124- isMobile = df ["device" ].struct .field ("isMobile" )
113+ label = transactions .notnull ().map ({True : 1 , False : 0 }). rename ( "label" )
114+ operating_system = df ["device" ].struct .field ("operatingSystem" )
115+ operating_system = operating_system .fillna ("" )
116+ is_mobile = df ["device" ].struct .field ("isMobile" )
125117 country = df ["geoNetwork" ].struct .field ("country" ).fillna ("" )
126118 pageviews = df ["totals" ].struct .field ("pageviews" ).fillna (0 )
127119 features = bpd .DataFrame (
128120 {
129- "os" : operatingSystem ,
130- "is_mobile" : isMobile ,
121+ "os" : operating_system ,
122+ "is_mobile" : is_mobile ,
131123 "country" : country ,
132124 "pageviews" : pageviews ,
133125 }
@@ -163,6 +155,143 @@ def test_bqml_getting_started(random_model_id):
163155 # [1 rows x 6 columns]
164156 # [END bigquery_dataframes_bqml_getting_started_tutorial_evaluate]
165157
166- # [START bigquery_dataframes_bqml_getting_started_tutorial_predict]
158+ # [START bigquery_dataframes_bqml_getting_started_tutorial_predict_by_country]
159+ import bigframes .pandas as bpd
160+
161+ # Select model you'll use for predicting.
162+ # `read_gbq_model` loads model data from
163+ # BigQuery, but you could also use the `model`
164+ # object from the previous steps.
165+ model = bpd .read_gbq_model (
166+ your_model_id , # For example: "bqml_tutorial.sample_model",
167+ )
168+
169+ # The filters parameter limits the number of tables scanned by the query.
170+ # The date range scanned is July 1, 2017 to August 1, 2017. This is the
171+ # data you're using to make the prediction.
172+ # It was collected in the month immediately following the time period
173+ # spanned by the training data.
174+ df = bpd .read_gbq_table (
175+ "bigquery-public-data.google_analytics_sample.ga_sessions_*" ,
176+ filters = [
177+ ("_table_suffix" , ">=" , "20170701" ),
178+ ("_table_suffix" , "<=" , "20170801" ),
179+ ],
180+ )
181+
182+ operating_system = df ["device" ].struct .field ("operatingSystem" )
183+ operating_system = operating_system .fillna ("" )
184+ is_mobile = df ["device" ].struct .field ("isMobile" )
185+ country = df ["geoNetwork" ].struct .field ("country" ).fillna ("" )
186+ pageviews = df ["totals" ].struct .field ("pageviews" ).fillna (0 )
187+ features = bpd .DataFrame (
188+ {
189+ "os" : operating_system ,
190+ "is_mobile" : is_mobile ,
191+ "country" : country ,
192+ "pageviews" : pageviews ,
193+ }
194+ )
195+ # Use Logistic Regression predict method to predict results
196+ # using your model.
197+ # Find more information here in
198+ # [BigFrames](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.linear_model.LogisticRegression#bigframes_ml_linear_model_LogisticRegression_predict)
199+
200+ predictions = model .predict (features )
201+
202+ # Call groupby method to group predicted_label by country.
203+ # Call sum method to get the total_predicted_label by country.
204+ total_predicted_purchases = predictions .groupby (["country" ])[
205+ ["predicted_label" ]
206+ ].sum ()
207+
208+ # Call the sort_values method with the parameter
209+ # ascending = False to get the highest values.
210+ # Call head method to limit to the 10 highest values.
211+ total_predicted_purchases .sort_values (ascending = False ).head (10 )
212+
213+ # country
214+ # United States 220
215+ # Taiwan 8
216+ # Canada 7
217+ # India 2
218+ # Japan 2
219+ # Turkey 2
220+ # Australia 1
221+ # Brazil 1
222+ # Germany 1
223+ # Guyana 1
224+ # Name: predicted_label, dtype: Int64
225+
226+ # [END bigquery_dataframes_bqml_getting_started_tutorial_predict_by_country]
227+
228+ # [START bigquery_dataframes_bqml_getting_started_tutorial_predict_by_visitor]
229+
230+ import bigframes .pandas as bpd
231+
232+ # Select model you'll use for predicting.
233+ # `read_gbq_model` loads model data from
234+ # BigQuery, but you could also use the `model`
235+ # object from the previous steps.
236+ model = bpd .read_gbq_model (
237+ your_model_id , # For example: "bqml_tutorial.sample_model",
238+ )
239+
240+ # The filters parameter limits the number of tables scanned by the query.
241+ # The date range scanned is July 1, 2017 to August 1, 2017. This is the
242+ # data you're using to make the prediction.
243+ # It was collected in the month immediately following the time period
244+ # spanned by the training data.
245+ df = bpd .read_gbq_table (
246+ "bigquery-public-data.google_analytics_sample.ga_sessions_*" ,
247+ filters = [
248+ ("_table_suffix" , ">=" , "20170701" ),
249+ ("_table_suffix" , "<=" , "20170801" ),
250+ ],
251+ )
252+
253+ operating_system = df ["device" ].struct .field ("operatingSystem" )
254+ operating_system = operating_system .fillna ("" )
255+ is_mobile = df ["device" ].struct .field ("isMobile" )
256+ country = df ["geoNetwork" ].struct .field ("country" ).fillna ("" )
257+ pageviews = df ["totals" ].struct .field ("pageviews" ).fillna (0 )
258+ full_visitor_id = df ["fullVisitorId" ]
259+
260+ features = bpd .DataFrame (
261+ {
262+ "os" : operating_system ,
263+ "is_mobile" : is_mobile ,
264+ "country" : country ,
265+ "pageviews" : pageviews ,
266+ "fullVisitorId" : full_visitor_id ,
267+ }
268+ )
269+
270+ predictions = model .predict (features )
271+
272+ # Call groupby method to group predicted_label by visitor.
273+ # Call sum method to get the total_predicted_label by visitor.
274+ total_predicted_purchases = predictions .groupby (["fullVisitorId" ])[
275+ ["predicted_label" ]
276+ ].sum ()
277+
278+ # Call the sort_values method with the parameter
279+ # ascending = False to get the highest values.
280+ # Call head method to limit to the 10 highest values.
281+ total_predicted_purchases .sort_values (ascending = False ).head (10 )
282+
283+ # fullVisitorId
284+ # 9417857471295131045 4
285+ # 0376394056092189113 2
286+ # 0456807427403774085 2
287+ # 057693500927581077 2
288+ # 112288330928895942 2
289+ # 1280993661204347450 2
290+ # 2105122376016897629 2
291+ # 2158257269735455737 2
292+ # 2969418676126258798 2
293+ # 489038402765684003 2
294+ # Name: predicted_label, dtype: Int64
295+
167296
168- # [END bigquery_dataframes_bqml_getting_started_tutorial_predict ]
297+ # [END bigquery_dataframes_bqml_getting_started_tutorial_predict_by_visitor ]
0 commit comments