From c6e277e4d055947bfc6bd1078d0d400dc3c1c25c Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 24 Jul 2025 21:54:01 +0000 Subject: [PATCH 1/3] chore: Migrate up to 15 scalar operators to SQLGlot Migrated the following unary scalar operators to SQLGlot: - StrftimeOp - UnixSeconds - UnixMicros - UnixMillis - FloorDtOp - geo_st_boundary_op - geo_st_geogfromtext_op - geo_st_isclosed_op - GeoStLengthOp - StructFieldOp - AsTypeOp - IsInOp - ToDatetimeOp - ToTimestampOp - ToTimedeltaOp --- .../sqlglot/expressions/unary_compiler.py | 75 +++++++++++++ .../test_unary_compiler/test_floor_dt/out.sql | 13 +++ .../test_geo_st_boundary/out.sql | 13 +++ .../test_geo_st_geogfromtext/out.sql | 13 +++ .../test_geo_st_isclosed/out.sql | 13 +++ .../test_geo_st_length/out.sql | 13 +++ .../test_unary_compiler/test_is_in/out.sql | 13 +++ .../test_unary_compiler/test_strftime/out.sql | 13 +++ .../test_to_datetime/out.sql | 13 +++ .../test_to_timedelta/out.sql | 13 +++ .../test_to_timestamp/out.sql | 13 +++ .../test_unix_micros/out.sql | 13 +++ .../test_unix_millis/out.sql | 13 +++ .../test_unix_seconds/out.sql | 13 +++ .../expressions/test_unary_compiler.py | 101 ++++++++++++++++++ 15 files changed, 345 insertions(+) create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_floor_dt/out.sql create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_geo_st_boundary/out.sql create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_geo_st_geogfromtext/out.sql create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_geo_st_isclosed/out.sql create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_geo_st_length/out.sql create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_is_in/out.sql create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_strftime/out.sql create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_to_datetime/out.sql create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_to_timedelta/out.sql create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_to_timestamp/out.sql create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_unix_micros/out.sql create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_unix_millis/out.sql create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_unix_seconds/out.sql diff --git a/bigframes/core/compile/sqlglot/expressions/unary_compiler.py b/bigframes/core/compile/sqlglot/expressions/unary_compiler.py index 609ac374b6..844193b8b7 100644 --- a/bigframes/core/compile/sqlglot/expressions/unary_compiler.py +++ b/bigframes/core/compile/sqlglot/expressions/unary_compiler.py @@ -105,6 +105,11 @@ def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: ) +@UNARY_OP_REGISTRATION.register(ops.AsTypeOp) +def _(op: ops.AsTypeOp, expr: TypedExpr) -> sge.Expression: + return sge.Cast(this=expr.expr, to=op.to_type) + + @UNARY_OP_REGISTRATION.register(ops.ArrayToStringOp) def _(op: ops.ArrayToStringOp, expr: TypedExpr) -> sge.Expression: return sge.ArrayToString(this=expr.expr, expression=f"'{op.delimiter}'") @@ -234,6 +239,11 @@ def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: ) - sge.convert(1) +@UNARY_OP_REGISTRATION.register(ops.FloorDtOp) +def _(op: ops.FloorDtOp, expr: TypedExpr) -> sge.Expression: + return sge.TimestampTrunc(this=expr.expr, unit=sge.Identifier(this=op.freq)) + + @UNARY_OP_REGISTRATION.register(ops.floor_op) def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: return sge.Floor(this=expr.expr) @@ -249,6 +259,26 @@ def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: return sge.func("ST_ASTEXT", expr.expr) +@UNARY_OP_REGISTRATION.register(ops.geo_st_boundary_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.func("ST_BOUNDARY", expr.expr) + + +@UNARY_OP_REGISTRATION.register(ops.geo_st_geogfromtext_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.func("SAFE.ST_GEOGFROMTEXT", expr.expr) + + +@UNARY_OP_REGISTRATION.register(ops.geo_st_isclosed_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.func("ST_ISCLOSED", expr.expr) + + +@UNARY_OP_REGISTRATION.register(ops.GeoStLengthOp) +def _(op: ops.GeoStLengthOp, expr: TypedExpr) -> sge.Expression: + return sge.func("ST_LENGTH", expr.expr) + + @UNARY_OP_REGISTRATION.register(ops.geo_x_op) def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: return sge.func("SAFE.ST_X", expr.expr) @@ -274,6 +304,11 @@ def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: return sge.BitwiseNot(this=expr.expr) +@UNARY_OP_REGISTRATION.register(ops.IsInOp) +def _(op: ops.IsInOp, expr: TypedExpr) -> sge.Expression: + return sge.In(this=expr.expr, expressions=[sge.convert(v) for v in op.values]) + + @UNARY_OP_REGISTRATION.register(ops.isalnum_op) def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: return sge.RegexpLike(this=expr.expr, expression=sge.convert(r"^(\p{N}|\p{L})+$")) @@ -517,6 +552,16 @@ def _(op: ops.StrSliceOp, expr: TypedExpr) -> sge.Expression: ) +@UNARY_OP_REGISTRATION.register(ops.StrftimeOp) +def _(op: ops.StrftimeOp, expr: TypedExpr) -> sge.Expression: + return sge.func("FORMAT_TIMESTAMP", sge.convert(op.date_format), expr.expr) + + +@UNARY_OP_REGISTRATION.register(ops.StructFieldOp) +def _(op: ops.StructFieldOp, expr: TypedExpr) -> sge.Expression: + return sge.StructExtract(this=expr.expr, expression=sge.convert(op.name_or_index)) + + @UNARY_OP_REGISTRATION.register(ops.tan_op) def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: return sge.func("TAN", expr.expr) @@ -537,6 +582,36 @@ def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: return sge.Floor(this=expr.expr) +@UNARY_OP_REGISTRATION.register(ops.ToDatetimeOp) +def _(op: ops.ToDatetimeOp, expr: TypedExpr) -> sge.Expression: + return sge.Cast(this=sge.func("TIMESTAMP_SECONDS", expr.expr), to="DATETIME") + + +@UNARY_OP_REGISTRATION.register(ops.ToTimestampOp) +def _(op: ops.ToTimestampOp, expr: TypedExpr) -> sge.Expression: + return sge.func("TIMESTAMP_SECONDS", expr.expr) + + +@UNARY_OP_REGISTRATION.register(ops.ToTimedeltaOp) +def _(op: ops.ToTimedeltaOp, expr: TypedExpr) -> sge.Expression: + return sge.Interval(this=expr.expr, unit=sge.Identifier(this="SECOND")) + + +@UNARY_OP_REGISTRATION.register(ops.UnixMicros) +def _(op: ops.UnixMicros, expr: TypedExpr) -> sge.Expression: + return sge.func("UNIX_MICROS", expr.expr) + + +@UNARY_OP_REGISTRATION.register(ops.UnixMillis) +def _(op: ops.UnixMillis, expr: TypedExpr) -> sge.Expression: + return sge.func("UNIX_MILLIS", expr.expr) + + +@UNARY_OP_REGISTRATION.register(ops.UnixSeconds) +def _(op: ops.UnixSeconds, expr: TypedExpr) -> sge.Expression: + return sge.func("UNIX_SECONDS", expr.expr) + + # JSON Ops @UNARY_OP_REGISTRATION.register(ops.JSONExtract) def _(op: ops.JSONExtract, expr: TypedExpr) -> sge.Expression: diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_floor_dt/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_floor_dt/out.sql new file mode 100644 index 0000000000..3c7efd3098 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_floor_dt/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `timestamp_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + TIMESTAMP_TRUNC(`bfcol_0`, DAY) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `timestamp_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_geo_st_boundary/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_geo_st_boundary/out.sql new file mode 100644 index 0000000000..31c0b45034 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_geo_st_boundary/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `geography_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + ST_BOUNDARY(`bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `geography_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_geo_st_geogfromtext/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_geo_st_geogfromtext/out.sql new file mode 100644 index 0000000000..ba4d9dd182 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_geo_st_geogfromtext/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `string_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + SAFE.ST_GEOGFROMTEXT(`bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `string_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_geo_st_isclosed/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_geo_st_isclosed/out.sql new file mode 100644 index 0000000000..d905e8470b --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_geo_st_isclosed/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `geography_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + ST_ISCLOSED(`bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `geography_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_geo_st_length/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_geo_st_length/out.sql new file mode 100644 index 0000000000..a023691d63 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_geo_st_length/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `geography_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + ST_LENGTH(`bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `geography_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_is_in/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_is_in/out.sql new file mode 100644 index 0000000000..36941df71b --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_is_in/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_0` IN (1, 2, 3) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `int64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_strftime/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_strftime/out.sql new file mode 100644 index 0000000000..077c30e7cb --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_strftime/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `timestamp_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + FORMAT_TIMESTAMP('%Y-%m-%d', `bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `timestamp_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_to_datetime/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_to_datetime/out.sql new file mode 100644 index 0000000000..096f14cc85 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_to_datetime/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + CAST(TIMESTAMP_SECONDS(`bfcol_0`) AS DATETIME) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `int64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_to_timedelta/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_to_timedelta/out.sql new file mode 100644 index 0000000000..b89056d65f --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_to_timedelta/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + INTERVAL `bfcol_0` SECOND AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `int64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_to_timestamp/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_to_timestamp/out.sql new file mode 100644 index 0000000000..b1e66ce3e7 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_to_timestamp/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + TIMESTAMP_SECONDS(`bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `int64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_unix_micros/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_unix_micros/out.sql new file mode 100644 index 0000000000..dcbf0be5c2 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_unix_micros/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `timestamp_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + UNIX_MICROS(`bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `timestamp_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_unix_millis/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_unix_millis/out.sql new file mode 100644 index 0000000000..ca58fbc97c --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_unix_millis/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `timestamp_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + UNIX_MILLIS(`bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `timestamp_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_unix_seconds/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_unix_seconds/out.sql new file mode 100644 index 0000000000..21f0b7b8c8 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_unix_seconds/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `timestamp_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + UNIX_SECONDS(`bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `timestamp_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py b/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py index 236f94045f..5cef982196 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py @@ -139,6 +139,13 @@ def test_expm1(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") +def test_floor_dt(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["timestamp_col"]] + sql = _apply_unary_op(bf_df, ops.FloorDtOp("DAY"), "timestamp_col") + + snapshot.assert_match(sql, "out.sql") + + def test_floor(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["float64_col"]] sql = _apply_unary_op(bf_df, ops.floor_op, "float64_col") @@ -160,6 +167,34 @@ def test_geo_st_astext(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") +def test_geo_st_boundary(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["geography_col"]] + sql = _apply_unary_op(bf_df, ops.geo_st_boundary_op, "geography_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_geo_st_geogfromtext(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["string_col"]] + sql = _apply_unary_op(bf_df, ops.geo_st_geogfromtext_op, "string_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_geo_st_isclosed(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["geography_col"]] + sql = _apply_unary_op(bf_df, ops.geo_st_isclosed_op, "geography_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_geo_st_length(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["geography_col"]] + sql = _apply_unary_op(bf_df, ops.GeoStLengthOp(True), "geography_col") + + snapshot.assert_match(sql, "out.sql") + + def test_geo_x(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["geography_col"]] sql = _apply_unary_op(bf_df, ops.geo_x_op, "geography_col") @@ -237,6 +272,13 @@ def test_invert(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") +def test_is_in(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col"]] + sql = _apply_unary_op(bf_df, ops.IsInOp(values=(1, 2, 3)), "int64_col") + + snapshot.assert_match(sql, "out.sql") + + def test_isalnum(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["string_col"]] sql = _apply_unary_op(bf_df, ops.isalnum_op, "string_col") @@ -419,6 +461,23 @@ def test_str_slice(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") +def test_strftime(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["timestamp_col"]] + sql = _apply_unary_op(bf_df, ops.StrftimeOp("%Y-%m-%d"), "timestamp_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_struct_field(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col"]] + bf_df["struct_col"] = bpd.DataFrame( + {"field1": bf_df["int64_col"], "field2": bf_df["int64_col"] * 2} + ).to_struct() + sql = _apply_unary_op(bf_df, ops.StructFieldOp("field1"), "struct_col") + + snapshot.assert_match(sql, "out_sql") + + def test_str_contains(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["string_col"]] sql = _apply_unary_op(bf_df, ops.StrContainsOp("e"), "string_col") @@ -510,6 +569,48 @@ def test_time(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") +def test_to_datetime(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col"]] + sql = _apply_unary_op(bf_df, ops.ToDatetimeOp(), "int64_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_to_timestamp(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col"]] + sql = _apply_unary_op(bf_df, ops.ToTimestampOp(), "int64_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_to_timedelta(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col"]] + sql = _apply_unary_op(bf_df, ops.ToTimedeltaOp("s"), "int64_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_unix_micros(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["timestamp_col"]] + sql = _apply_unary_op(bf_df, ops.UnixMicros(), "timestamp_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_unix_millis(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["timestamp_col"]] + sql = _apply_unary_op(bf_df, ops.UnixMillis(), "timestamp_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_unix_seconds(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["timestamp_col"]] + sql = _apply_unary_op(bf_df, ops.UnixSeconds(), "timestamp_col") + + snapshot.assert_match(sql, "out.sql") + + def test_timedelta_floor(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["int64_col"]] sql = _apply_unary_op(bf_df, ops.timedelta_floor_op, "int64_col") From 42ec975c2588c3f58b99b4ec089ed130f4a48ac0 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 30 Jul 2025 20:07:04 +0000 Subject: [PATCH 2/3] add TODO for some ops --- bigframes/core/compile/sqlglot/expressions/unary_compiler.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bigframes/core/compile/sqlglot/expressions/unary_compiler.py b/bigframes/core/compile/sqlglot/expressions/unary_compiler.py index 844193b8b7..2104d9c4d1 100644 --- a/bigframes/core/compile/sqlglot/expressions/unary_compiler.py +++ b/bigframes/core/compile/sqlglot/expressions/unary_compiler.py @@ -107,6 +107,7 @@ def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: @UNARY_OP_REGISTRATION.register(ops.AsTypeOp) def _(op: ops.AsTypeOp, expr: TypedExpr) -> sge.Expression: + # TODO: Support more types for casting, such as JSON, etc. return sge.Cast(this=expr.expr, to=op.to_type) @@ -241,6 +242,7 @@ def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: @UNARY_OP_REGISTRATION.register(ops.FloorDtOp) def _(op: ops.FloorDtOp, expr: TypedExpr) -> sge.Expression: + # TODO: Remove this method when it is covered by ops.FloorOp return sge.TimestampTrunc(this=expr.expr, unit=sge.Identifier(this=op.freq)) From 4f0b7bd8f2e4075d0c2415c58ae807668dccb082 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 30 Jul 2025 20:35:01 +0000 Subject: [PATCH 3/3] fix struct field op --- .../sqlglot/expressions/unary_compiler.py | 14 +++++++++++++- .../test_struct_field/out.sql | 13 +++++++++++++ .../sqlglot/expressions/test_unary_compiler.py | 16 +++++++++------- 3 files changed, 35 insertions(+), 8 deletions(-) create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_struct_field/out.sql diff --git a/bigframes/core/compile/sqlglot/expressions/unary_compiler.py b/bigframes/core/compile/sqlglot/expressions/unary_compiler.py index 2104d9c4d1..125c60bbf4 100644 --- a/bigframes/core/compile/sqlglot/expressions/unary_compiler.py +++ b/bigframes/core/compile/sqlglot/expressions/unary_compiler.py @@ -16,6 +16,8 @@ import typing +import pandas as pd +import pyarrow as pa import sqlglot import sqlglot.expressions as sge @@ -561,7 +563,17 @@ def _(op: ops.StrftimeOp, expr: TypedExpr) -> sge.Expression: @UNARY_OP_REGISTRATION.register(ops.StructFieldOp) def _(op: ops.StructFieldOp, expr: TypedExpr) -> sge.Expression: - return sge.StructExtract(this=expr.expr, expression=sge.convert(op.name_or_index)) + if isinstance(op.name_or_index, str): + name = op.name_or_index + else: + pa_type = typing.cast(pd.ArrowDtype, expr.dtype) + pa_struct_type = typing.cast(pa.StructType, pa_type.pyarrow_dtype) + name = pa_struct_type.field(op.name_or_index).name + + return sge.Column( + this=sge.to_identifier(name, quoted=True), + catalog=expr.expr, + ) @UNARY_OP_REGISTRATION.register(ops.tan_op) diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_struct_field/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_struct_field/out.sql new file mode 100644 index 0000000000..b3e8fde0b2 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_struct_field/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `people` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`nested_structs_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_0`.`name` AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `people` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py b/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py index 5cef982196..0a930d68ae 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py @@ -468,14 +468,16 @@ def test_strftime(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") -def test_struct_field(scalar_types_df: bpd.DataFrame, snapshot): - bf_df = scalar_types_df[["int64_col"]] - bf_df["struct_col"] = bpd.DataFrame( - {"field1": bf_df["int64_col"], "field2": bf_df["int64_col"] * 2} - ).to_struct() - sql = _apply_unary_op(bf_df, ops.StructFieldOp("field1"), "struct_col") +def test_struct_field(nested_structs_types_df: bpd.DataFrame, snapshot): + bf_df = nested_structs_types_df[["people"]] + + # When a name string is provided. + sql = _apply_unary_op(bf_df, ops.StructFieldOp("name"), "people") + snapshot.assert_match(sql, "out.sql") - snapshot.assert_match(sql, "out_sql") + # When an index integer is provided. + sql = _apply_unary_op(bf_df, ops.StructFieldOp(0), "people") + snapshot.assert_match(sql, "out.sql") def test_str_contains(scalar_types_df: bpd.DataFrame, snapshot):