
Commit 2b6f3f5

feat: add code samples for dbt bigframes integration
1 parent 52c8233 commit 2b6f3f5

File tree

4 files changed: +152 -0 lines changed


dbt_bigframes_integration/.dbt.yml

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
dbt_sample_project:
  outputs:
    dev: # The target environment name (e.g., dev, prod)
      compute_region: us-central1 # Region used for compute operations
      dataset: dbt_sample_dataset # BigQuery dataset where dbt will create models
      gcs_bucket: dbt_sample_bucket # GCS bucket to store output files
      location: US # BigQuery dataset location
      method: oauth # Authentication method
      priority: interactive # Job priority: "interactive" or "batch"
      project: bigframes-dev # GCP project ID
      threads: 1 # Number of threads dbt can use for running models in parallel
      type: bigquery # Specifies the dbt adapter
  target: dev # The default target environment
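
For quick experimentation outside of dbt, roughly the same connection settings can be applied to an ad-hoc BigFrames session through its global options. A minimal sketch, assuming the same project and location as the profile above (this snippet is illustrative and not part of the commit):

import bigframes.pandas as bpd

# Mirror the profile's connection settings for an interactive session.
bpd.options.bigquery.project = "bigframes-dev"  # `project` in .dbt.yml
bpd.options.bigquery.location = "US"            # `location` in .dbt.yml

# Subsequent reads use this configuration.
df = bpd.read_gbq("bigquery-public-data.epa_historical_air_quality.temperature_hourly_summary")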
dbt_bigframes_integration/dbt_project.yml

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@

# Name your project! Project names should contain only lowercase characters
# and underscores. A good package name should reflect your organization's
# name or the intended use of these models.
name: 'dbt_sample_project'
version: '1.0.0'

# This setting configures which "profile" dbt uses for this project.
profile: 'dbt_sample_project'

# These configurations specify where dbt should look for different types of files.
# The `model-paths` config, for example, states that models in this project can be
# found in the "models/" directory. You probably won't need to change these!
model-paths: ["models"]
analysis-paths: ["analyses"]
test-paths: ["tests"]
seed-paths: ["seeds"]
macro-paths: ["macros"]
snapshot-paths: ["snapshots"]

clean-targets: # directories to be removed by `dbt clean`
  - "target"
  - "dbt_packages"


# Configuring models
# Full documentation: https://docs.getdbt.com/docs/configuring-models

# In this example config, we tell dbt to build all models in the example/
# directory as views. These settings can be overridden in the individual model
# files using the `{{ config(...) }}` macro, as sketched below.
models:
  dbt_sample_project:
    # Optional: These settings (e.g., submission_method, notebook_template_id,
    # etc.) can also be defined directly in the Python model using dbt.config.
    submission_method: bigframes
    # Configs prefixed with + apply to all files under models/example/.
    example:
      +materialized: view
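
As the comments above note, project-level settings can be overridden per model. In a Python model, the counterpart of the `{{ config(...) }}` macro is dbt.config(...). A minimal hypothetical sketch (this model body is for illustration only, not part of the commit):

def model(dbt, session):
    # Override the project-level `view` materialization for this one model.
    dbt.config(
        submission_method="bigframes",
        materialized="table",
    )
    # Hypothetical upstream reference, for illustration.
    return dbt.ref("dbt_bigframes_code_sample_1")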
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@

# This example demonstrates a common pattern: transforming raw BigQuery data
# into a processed table using dbt in BigFrames mode.
#
# Key defaults when using BigFrames in dbt:
# - The default materialization is 'table' unless specified otherwise.
# - The default timeout for the job is 3600 seconds (60 minutes).
# - If no runtime template is provided, dbt will automatically create and reuse
#   a default one.
#
# This code sample shows a basic pattern for reading a BigQuery public dataset,
# processing it using pandas-like operations, and outputting a cleaned table.


def model(dbt, session):
    # Optional: override settings from dbt_project.yml. When both are set,
    # dbt.config takes precedence over dbt_project.yml.
    # Use BigFrames mode to execute the Python model.
    dbt.config(submission_method="bigframes")

    # Define the BigQuery table path from which to read data.
    table = "bigquery-public-data.epa_historical_air_quality.temperature_hourly_summary"

    # Define the specific columns to select from the BigQuery table.
    columns = ["state_name", "county_name", "date_local", "time_local", "sample_measurement"]

    # Read data from the specified BigQuery table into a BigFrames DataFrame.
    # BigFrames lets you interact with BigQuery tables using a pandas-like API.
    df = session.read_gbq(table, columns=columns)

    # Sort the DataFrame by the selected columns. This prepares the data for
    # `drop_duplicates` to ensure consistent duplicate removal.
    df = df.sort_values(columns).drop_duplicates(columns)

    # Group the DataFrame by 'state_name', 'county_name', and 'date_local'. For
    # each group, calculate the minimum and maximum of the 'sample_measurement'
    # column. The result is a BigFrames DataFrame with a MultiIndex.
    result = df.groupby(["state_name", "county_name", "date_local"])["sample_measurement"] \
        .agg(["min", "max"])

    # Rename the aggregate columns and convert the MultiIndex of the 'result'
    # DataFrame into regular columns. This flattens the DataFrame so
    # 'state_name', 'county_name', and 'date_local' become regular columns again.
    result = result.rename(columns={'min': 'min_temperature', 'max': 'max_temperature'}) \
        .reset_index()

    # Return the processed BigFrames DataFrame.
    # In a dbt Python model, this DataFrame will be materialized as a table.
    return result
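
To sanity-check the transformation interactively before wiring it into dbt, essentially the same pipeline can be run with plain bigframes.pandas. A sketch, assuming an environment already authenticated to GCP (not part of this commit):

import bigframes.pandas as bpd

table = "bigquery-public-data.epa_historical_air_quality.temperature_hourly_summary"
columns = ["state_name", "county_name", "date_local", "time_local", "sample_measurement"]

df = bpd.read_gbq(table, columns=columns)
df = df.sort_values(columns).drop_duplicates(columns)
result = (
    df.groupby(["state_name", "county_name", "date_local"])["sample_measurement"]
    .agg(["min", "max"])
    .rename(columns={"min": "min_temperature", "max": "max_temperature"})
    .reset_index()
)
print(result.head())  # Preview a few rows without materializing a table.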
Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@

# This example demonstrates how to build an incremental model.
#
# It applies lightweight, row-level logic to update or insert records into a
# target BigQuery table. If the target table already exists, dbt will perform a
# merge based on the specified unique keys; otherwise, it will create a new
# table automatically.
#
# It also defines and applies a BigFrames UDF to add a descriptive summary
# column based on temperature data.


import bigframes.pandas as bpd


def model(dbt, session):
    # Optional: override settings from dbt_project.yml.
    # When both are set, dbt.config takes precedence over dbt_project.yml.
    dbt.config(
        # Use BigFrames mode to execute the Python model.
        submission_method="bigframes",
        # Materialize as an incremental model.
        materialized='incremental',
        # Use the MERGE strategy to update rows during incremental runs.
        incremental_strategy='merge',
        # Composite key to match existing rows for updates.
        unique_key=["state_name", "county_name", "date_local"],
    )

    # Reference an upstream dbt model or table as a DataFrame input.
    df = dbt.ref("dbt_bigframes_code_sample_1")

    # Define a BigFrames UDF to generate a temperature description.
    @bpd.udf(dataset='dbt_sample_dataset', name='describe_udf')
    def describe(
        max_temperature: float,
        min_temperature: float,
    ) -> str:
        is_hot = max_temperature > 85.0
        is_cold = min_temperature < 50.0

        if is_hot and is_cold:
            return "Expect both hot and cold conditions today."
        if is_hot:
            return "Overall, it's a hot day."
        if is_cold:
            return "Overall, it's a cold day."
        return "Comfortable throughout the day."

    # Apply the UDF using combine and store the result in a "describe" column.
    df["describe"] = df["max_temperature"].combine(df["min_temperature"], describe)

    # Return the transformed DataFrame as the final dbt model output.
    return df
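
For intuition on the combine call above: Series.combine(other, func) applies func elementwise to aligned pairs of values from the two Series. A minimal sketch with plain pandas, which has the same semantics (illustration only, not part of the commit):

import pandas as pd

def describe(max_temperature: float, min_temperature: float) -> str:
    is_hot = max_temperature > 85.0
    is_cold = min_temperature < 50.0
    if is_hot and is_cold:
        return "Expect both hot and cold conditions today."
    if is_hot:
        return "Overall, it's a hot day."
    if is_cold:
        return "Overall, it's a cold day."
    return "Comfortable throughout the day."

max_t = pd.Series([90.0, 70.0, 88.0])
min_t = pd.Series([40.0, 55.0, 60.0])

summaries = max_t.combine(min_t, describe)
# summaries[0] == "Expect both hot and cold conditions today."
# summaries[1] == "Comfortable throughout the day."
# summaries[2] == "Overall, it's a hot day."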
