From 30e1eb64ff2c0a4ac07ddd82625806554910579b Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Fri, 13 Jun 2025 21:55:57 +0000 Subject: [PATCH 01/18] Refactor IsNullOp and NotNullOp logic This change consolidates the definition and compilation logic for IsNullOp, isnull_op, NotNullOp, and notnull_op into a new, dedicated file: `bigframes/operations/isnull_op.py`. Key changes include: - Moved operator definitions from `generic_ops.py` to `isnull_op.py`. - Moved Ibis scalar compilation logic from `scalar_op_compiler.py` to `isnull_op.py`. - Moved Polars expression compilation logic from `polars/compiler.py` to `isnull_op.py`. - Updated main compilers (`ScalarOpCompiler` and `PolarsExpressionCompiler`) to directly import and register the compilation functions from `isnull_op.py`. - Ensured all internal references and naming conventions (`IsNullOp`, `isnull_op`, `NotNullOp`, `notnull_op`) are consistent with the refactored structure. NOTE: I was unable to perform test validation (unit and system) due to missing project-specific dependencies, primarily `bigframes_vendored` and `test_utils.prefixer`. The changes are provided based on the completion of the refactoring steps as you requested. 
--- bigframes/core/compile/polars/compiler.py | 18 ++--- bigframes/core/compile/scalar_op_compiler.py | 21 +++--- bigframes/dataframe.py | 1 + bigframes/operations/__init__.py | 3 +- bigframes/operations/generic_ops.py | 16 ----- bigframes/operations/isnull_op.py | 76 ++++++++++++++++++++ 6 files changed, 101 insertions(+), 34 deletions(-) create mode 100644 bigframes/operations/isnull_op.py diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index 62654c1518..6bfc2dda53 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -231,14 +231,6 @@ def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: else: return input.is_in(op.values) or input.is_null() - @compile_op.register(gen_ops.IsNullOp) - def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: - return input.is_null() - - @compile_op.register(gen_ops.NotNullOp) - def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: - return input.is_not_null() - @compile_op.register(gen_ops.FillNaOp) @compile_op.register(gen_ops.CoalesceOp) def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: @@ -268,6 +260,16 @@ def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: # eg. 
We want "True" instead of "true" for bool to strin return input.cast(_DTYPE_MAPPING[op.to_type], strict=not op.safe) + # Register ops from other modules + from bigframes.operations import IsNullOp, NotNullOp + from bigframes.operations.isnull_op import ( + _polars_isnull_op_impl, + _polars_notnull_op_impl, + ) + + PolarsExpressionCompiler.compile_op.register(IsNullOp, _polars_isnull_op_impl) + PolarsExpressionCompiler.compile_op.register(NotNullOp, _polars_notnull_op_impl) + @dataclasses.dataclass(frozen=True) class PolarsAggregateCompiler: scalar_compiler = PolarsExpressionCompiler() diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index b819b1c4e2..3d82ef0033 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -228,18 +228,23 @@ def _register( # Singleton compiler scalar_op_compiler = ScalarOpCompiler() +# Registrations for operations defined in isnull_op.py +from bigframes.operations import isnull_op, notnull_op +from bigframes.operations.isnull_op import ( + _ibis_isnull_op_impl, + _ibis_notnull_op_impl, +) -### Unary Ops -@scalar_op_compiler.register_unary_op(ops.isnull_op) -def isnull_op_impl(x: ibis_types.Value): - return x.isnull() - +@scalar_op_compiler.register_unary_op(isnull_op) +def _scalar_isnull_op_impl_wrapper(x: ibis_types.Value): + return _ibis_isnull_op_impl(x) -@scalar_op_compiler.register_unary_op(ops.notnull_op) -def notnull_op_impl(x: ibis_types.Value): - return x.notnull() +@scalar_op_compiler.register_unary_op(notnull_op) +def _scalar_notnull_op_impl_wrapper(x: ibis_types.Value): + return _ibis_notnull_op_impl(x) +### Unary Ops @scalar_op_compiler.register_unary_op(ops.hash_op) def hash_op_impl(x: ibis_types.Value): return typing.cast(ibis_types.IntegerValue, x).hash() diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 7e5bb3049a..81505ce131 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ 
-2521,6 +2521,7 @@ def _filter_rows( elif items is not None: # Behavior matches pandas 2.1+, older pandas versions would reindex block = self._block + block = self._block block, mask_id = block.apply_unary_op( self._block.index_columns[0], ops.IsInOp(values=tuple(items)) ) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 291bf17fa5..9fc0a93fff 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -81,11 +81,9 @@ hash_op, invert_op, IsInOp, - isnull_op, MapOp, maximum_op, minimum_op, - notnull_op, RowKey, SqlScalarOp, where_op, @@ -104,6 +102,7 @@ GeoStDistanceOp, GeoStLengthOp, ) +from bigframes.operations.isnull_op import isnull_op, notnull_op from bigframes.operations.json_ops import ( JSONExtract, JSONExtractArray, diff --git a/bigframes/operations/generic_ops.py b/bigframes/operations/generic_ops.py index 3c3f9653b4..a8e876da0d 100644 --- a/bigframes/operations/generic_ops.py +++ b/bigframes/operations/generic_ops.py @@ -29,22 +29,6 @@ ) invert_op = InvertOp() -IsNullOp = base_ops.create_unary_op( - name="isnull", - type_signature=op_typing.FixedOutputType( - lambda x: True, dtypes.BOOL_DTYPE, description="nullable" - ), -) -isnull_op = IsNullOp() - -NotNullOp = base_ops.create_unary_op( - name="notnull", - type_signature=op_typing.FixedOutputType( - lambda x: True, dtypes.BOOL_DTYPE, description="nullable" - ), -) -notnull_op = NotNullOp() - HashOp = base_ops.create_unary_op( name="hash", type_signature=op_typing.FixedOutputType( diff --git a/bigframes/operations/isnull_op.py b/bigframes/operations/isnull_op.py new file mode 100644 index 0000000000..0579d7ac1c --- /dev/null +++ b/bigframes/operations/isnull_op.py @@ -0,0 +1,76 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import typing + +# Direct imports from bigframes +from bigframes import dtypes +from bigframes.operations import base_ops +import bigframes.operations.type as op_typing + +# Imports for Ibis compilation +from bigframes_vendored.ibis.expr import types as ibis_types + +# Imports for Polars compilation +try: + import polars as pl +except ImportError: + # Polars is optional, error will be raised elsewhere if user tries to use it. + pass + + +# Definitions of IsNullOp and NotNullOp operations +IsNullOp = base_ops.create_unary_op( + name="isnull", + type_signature=op_typing.FixedOutputType( + lambda x: True, dtypes.BOOL_DTYPE, description="nullable" + ), +) +isnull_op = IsNullOp() + +NotNullOp = base_ops.create_unary_op( + name="notnull", + type_signature=op_typing.FixedOutputType( + lambda x: True, dtypes.BOOL_DTYPE, description="nullable" + ), +) +notnull_op = NotNullOp() + +# Ibis Scalar Op Implementations +def _ibis_isnull_op_impl(x: ibis_types.Value): + return x.isnull() + +def _ibis_notnull_op_impl(x: ibis_types.Value): + return x.notnull() + + +# Polars Expression Implementations +def _polars_isnull_op_impl(op: IsNullOp, input: pl.Expr) -> pl.Expr: + return input.is_null() + +def _polars_notnull_op_impl(op: NotNullOp, input: pl.Expr) -> pl.Expr: + return input.is_not_null() + +__all__ = [ + "IsNullOp", + "isnull_op", + "NotNullOp", + "notnull_op", + "_ibis_isnull_op_impl", + "_ibis_notnull_op_impl", + "_polars_isnull_op_impl", + "_polars_notnull_op_impl", +] From 1b711aae2d6d63c0901e5b123f9efca0a9c67b5f Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 8 Jul 2025 10:01:12 -0700 Subject: [PATCH 02/18] fix circular imports --- bigframes/core/array_value.py | 8 +- bigframes/core/compile/compiled.py | 11 +- bigframes/core/compile/compiler.py | 5 +- bigframes/core/compile/polars/compiler.py | 10 - bigframes/core/compile/scalar_op_compiler.py | 2081 +--------------- bigframes/core/compile/scalar_op_registry.py | 2086 +++++++++++++++++ bigframes/core/expression.py | 9 +- bigframes/operations/__init__.py | 3 +- .../__init__.py} | 0 bigframes/operations/generic_ops/isnull_op.py | 68 + .../operations/generic_ops/notnull_op.py | 72 + bigframes/operations/isnull_op.py | 76 - 12 files changed, 2257 insertions(+), 2172 deletions(-) create mode 100644 bigframes/core/compile/scalar_op_registry.py rename bigframes/operations/{generic_ops.py => generic_ops/__init__.py} (100%) create mode 100644 bigframes/operations/generic_ops/isnull_op.py create mode 100644 bigframes/operations/generic_ops/notnull_op.py delete mode 100644 bigframes/operations/isnull_op.py diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py index b47637cb59..968add8784 100644 --- a/bigframes/core/array_value.py +++ b/bigframes/core/array_value.py @@ -37,10 +37,10 @@ from bigframes.core.window_spec import WindowSpec import bigframes.dtypes import bigframes.exceptions as bfe -import bigframes.operations as ops -import bigframes.operations.aggregations as agg_ops if typing.TYPE_CHECKING: + # Avoid circular imports. + import bigframes.operations.aggregations as agg_ops from bigframes.session import Session ORDER_ID_COLUMN = "bigframes_ordering_id" @@ -185,6 +185,8 @@ def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: def row_count(self) -> ArrayValue: """Get number of rows in ArrayValue as a single-entry ArrayValue.""" + import bigframes.operations.aggregations as agg_ops # Avoid circular imports. 
+ return ArrayValue( nodes.AggregateNode( child=self.node, @@ -200,6 +202,8 @@ def row_count(self) -> ArrayValue: # Operations def filter_by_id(self, predicate_id: str, keep_null: bool = False) -> ArrayValue: """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" + import bigframes.operations as ops # Avoid circular imports. + predicate: ex.Expression = ex.deref(predicate_id) if keep_null: predicate = ops.fillna_op.as_expr(predicate, ex.const(True)) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 314b54fc6d..3245d68fdf 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -34,7 +34,6 @@ import bigframes.core.compile.googlesql import bigframes.core.compile.ibis_types import bigframes.core.compile.scalar_op_compiler as op_compilers -import bigframes.core.compile.scalar_op_compiler as scalar_op_compiler import bigframes.core.expression as ex from bigframes.core.ordering import OrderingExpression import bigframes.core.sql @@ -45,6 +44,12 @@ op_compiler = op_compilers.scalar_op_compiler +# This must be the last import. Currently depending on side-effects. +# TODO(tswast): Refactor all ops to register in the same file as where they are +# defined so we don't need this. 
+import bigframes.core.compile.scalar_op_registry # noqa: F401,E402 + + # Ibis Implementations class UnorderedIR: def __init__( @@ -679,13 +684,15 @@ def _join_condition( def _as_groupable(value: ibis_types.Value): + from bigframes.core.compile import scalar_op_registry + # Some types need to be converted to another type to enable groupby if value.type().is_float64(): return value.cast(ibis_dtypes.str) elif value.type().is_geospatial(): return typing.cast(ibis_types.GeoSpatialColumn, value).as_binary() elif value.type().is_json(): - return scalar_op_compiler.to_json_string(value) + return scalar_op_registry.to_json_string(value) else: return value diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 0efbd47ae4..3f5e7ba533 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -29,7 +29,6 @@ import bigframes.core.compile.concat as concat_impl import bigframes.core.compile.configs as configs import bigframes.core.compile.explode -import bigframes.core.compile.scalar_op_compiler as compile_scalar import bigframes.core.nodes as nodes import bigframes.core.ordering as bf_ordering import bigframes.core.rewrite as rewrites @@ -178,6 +177,8 @@ def compile_readlocal(node: nodes.ReadLocalNode, *args): @_compile_node.register def compile_readtable(node: nodes.ReadTableNode, *args): + from bigframes.core.compile import scalar_op_registry + ibis_table = _table_to_ibis( node.source, scan_cols=[col.source_id for col in node.scan_list.items] ) @@ -188,7 +189,7 @@ def compile_readtable(node: nodes.ReadTableNode, *args): scan_item.dtype == dtypes.JSON_DTYPE and ibis_table[scan_item.source_id].type() == ibis_dtypes.string ): - json_column = compile_scalar.parse_json( + json_column = scalar_op_registry.parse_json( ibis_table[scan_item.source_id] ).name(scan_item.source_id) ibis_table = ibis_table.mutate(json_column) diff --git a/bigframes/core/compile/polars/compiler.py 
b/bigframes/core/compile/polars/compiler.py index f69542805c..9fb4131107 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -262,16 +262,6 @@ def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: # eg. We want "True" instead of "true" for bool to strin return input.cast(_DTYPE_MAPPING[op.to_type], strict=not op.safe) - # Register ops from other modules - from bigframes.operations import IsNullOp, NotNullOp - from bigframes.operations.isnull_op import ( - _polars_isnull_op_impl, - _polars_notnull_op_impl, - ) - - PolarsExpressionCompiler.compile_op.register(IsNullOp, _polars_isnull_op_impl) - PolarsExpressionCompiler.compile_op.register(NotNullOp, _polars_notnull_op_impl) - @dataclasses.dataclass(frozen=True) class PolarsAggregateCompiler: scalar_compiler = PolarsExpressionCompiler() diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 4506155e7b..d5f3e15d34 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -12,43 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+"""To avoid circular imports, this module should _not_ depend on any ops.""" + from __future__ import annotations import functools import typing +from typing import TYPE_CHECKING -import bigframes_vendored.ibis.expr.api as ibis_api -import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes -import bigframes_vendored.ibis.expr.operations.generic as ibis_generic -import bigframes_vendored.ibis.expr.operations.udf as ibis_udf import bigframes_vendored.ibis.expr.types as ibis_types -import numpy as np -import pandas as pd -from bigframes.core.compile.constants import UNIT_TO_US_CONVERSION_FACTORS -import bigframes.core.compile.default_ordering import bigframes.core.compile.ibis_types import bigframes.core.expression as ex -import bigframes.dtypes -import bigframes.operations as ops - -_ZERO = typing.cast(ibis_types.NumericValue, ibis_types.literal(0)) -_NAN = typing.cast(ibis_types.NumericValue, ibis_types.literal(np.nan)) -_INF = typing.cast(ibis_types.NumericValue, ibis_types.literal(np.inf)) -_NEG_INF = typing.cast(ibis_types.NumericValue, ibis_types.literal(-np.inf)) -# Approx Highest number you can pass in to EXP function and get a valid FLOAT64 result -# FLOAT64 has 11 exponent bits, so max values is about 2**(2**10) -# ln(2**(2**10)) == (2**10)*ln(2) ~= 709.78, so EXP(x) for x>709.78 will overflow. 
-_FLOAT64_EXP_BOUND = typing.cast(ibis_types.NumericValue, ibis_types.literal(709.78)) - -_OBJ_REF_STRUCT_SCHEMA = ( - ("uri", ibis_dtypes.String), - ("version", ibis_dtypes.String), - ("authorizer", ibis_dtypes.String), - ("details", ibis_dtypes.JSON), -) -_OBJ_REF_IBIS_DTYPE = ibis_dtypes.Struct.from_tuples(_OBJ_REF_STRUCT_SCHEMA) # type: ignore +if TYPE_CHECKING: + import bigframes.operations as ops class ScalarOpCompiler: @@ -227,2052 +205,3 @@ def _register( # Singleton compiler scalar_op_compiler = ScalarOpCompiler() - -# Registrations for operations defined in isnull_op.py -from bigframes.operations import isnull_op, notnull_op -from bigframes.operations.isnull_op import ( - _ibis_isnull_op_impl, - _ibis_notnull_op_impl, -) - -@scalar_op_compiler.register_unary_op(isnull_op) -def _scalar_isnull_op_impl_wrapper(x: ibis_types.Value): - return _ibis_isnull_op_impl(x) - -@scalar_op_compiler.register_unary_op(notnull_op) -def _scalar_notnull_op_impl_wrapper(x: ibis_types.Value): - return _ibis_notnull_op_impl(x) - - -### Unary Ops -@scalar_op_compiler.register_unary_op(ops.hash_op) -def hash_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.IntegerValue, x).hash() - - -# Trig Functions -@scalar_op_compiler.register_unary_op(ops.sin_op) -def sin_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.NumericValue, x).sin() - - -@scalar_op_compiler.register_unary_op(ops.cos_op) -def cos_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.NumericValue, x).cos() - - -@scalar_op_compiler.register_unary_op(ops.tan_op) -def tan_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.NumericValue, x).tan() - - -# Inverse trig functions -@scalar_op_compiler.register_unary_op(ops.arcsin_op) -def arcsin_op_impl(x: ibis_types.Value): - numeric_value = typing.cast(ibis_types.NumericValue, x) - domain = numeric_value.abs() <= _ibis_num(1) - return (~domain).ifelse(_NAN, numeric_value.asin()) - - 
-@scalar_op_compiler.register_unary_op(ops.arccos_op) -def arccos_op_impl(x: ibis_types.Value): - numeric_value = typing.cast(ibis_types.NumericValue, x) - domain = numeric_value.abs() <= _ibis_num(1) - return (~domain).ifelse(_NAN, numeric_value.acos()) - - -@scalar_op_compiler.register_unary_op(ops.arctan_op) -def arctan_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.NumericValue, x).atan() - - -@scalar_op_compiler.register_binary_op(ops.arctan2_op) -def arctan2_op_impl(x: ibis_types.Value, y: ibis_types.Value): - return typing.cast(ibis_types.NumericValue, x).atan2( - typing.cast(ibis_types.NumericValue, y) - ) - - -# Hyperbolic trig functions -# BQ has these functions, but Ibis doesn't -@scalar_op_compiler.register_unary_op(ops.sinh_op) -def sinh_op_impl(x: ibis_types.Value): - numeric_value = typing.cast(ibis_types.NumericValue, x) - sinh_result = (numeric_value.exp() - (numeric_value.negate()).exp()) / _ibis_num(2) - domain = numeric_value.abs() < _FLOAT64_EXP_BOUND - return (~domain).ifelse(_INF * numeric_value.sign(), sinh_result) - - -@scalar_op_compiler.register_unary_op(ops.cosh_op) -def cosh_op_impl(x: ibis_types.Value): - numeric_value = typing.cast(ibis_types.NumericValue, x) - cosh_result = (numeric_value.exp() + (numeric_value.negate()).exp()) / _ibis_num(2) - domain = numeric_value.abs() < _FLOAT64_EXP_BOUND - return (~domain).ifelse(_INF, cosh_result) - - -@scalar_op_compiler.register_unary_op(ops.tanh_op) -def tanh_op_impl(x: ibis_types.Value): - numeric_value = typing.cast(ibis_types.NumericValue, x) - tanh_result = (numeric_value.exp() - (numeric_value.negate()).exp()) / ( - numeric_value.exp() + (numeric_value.negate()).exp() - ) - # Beyond +-20, is effectively just the sign function - domain = numeric_value.abs() < _ibis_num(20) - return (~domain).ifelse(numeric_value.sign(), tanh_result) - - -@scalar_op_compiler.register_unary_op(ops.arcsinh_op) -def arcsinh_op_impl(x: ibis_types.Value): - numeric_value = 
typing.cast(ibis_types.NumericValue, x) - sqrt_part = ((numeric_value * numeric_value) + _ibis_num(1)).sqrt() - return (numeric_value.abs() + sqrt_part).ln() * numeric_value.sign() - - -@scalar_op_compiler.register_unary_op(ops.arccosh_op) -def arccosh_op_impl(x: ibis_types.Value): - numeric_value = typing.cast(ibis_types.NumericValue, x) - sqrt_part = ((numeric_value * numeric_value) - _ibis_num(1)).sqrt() - acosh_result = (numeric_value + sqrt_part).ln() - domain = numeric_value >= _ibis_num(1) - return (~domain).ifelse(_NAN, acosh_result) - - -@scalar_op_compiler.register_unary_op(ops.arctanh_op) -def arctanh_op_impl(x: ibis_types.Value): - numeric_value = typing.cast(ibis_types.NumericValue, x) - domain = numeric_value.abs() < _ibis_num(1) - numerator = numeric_value + _ibis_num(1) - denominator = _ibis_num(1) - numeric_value - ln_input = typing.cast(ibis_types.NumericValue, numerator.div(denominator)) - atanh_result = ln_input.ln().div(2) - - out_of_domain = (numeric_value.abs() == _ibis_num(1)).ifelse( - _INF * numeric_value, _NAN - ) - - return (~domain).ifelse(out_of_domain, atanh_result) - - -# Numeric Ops -@scalar_op_compiler.register_unary_op(ops.floor_op) -def floor_op_impl(x: ibis_types.Value): - x_numeric = typing.cast(ibis_types.NumericValue, x) - if x_numeric.type().is_integer(): - return x_numeric.cast(ibis_dtypes.Float64()) - if x_numeric.type().is_floating(): - # Default ibis impl tries to cast to integer, which doesn't match pandas and can overflow - return float_floor(x_numeric) - else: # numeric - return x_numeric.floor() - - -@scalar_op_compiler.register_unary_op(ops.ceil_op) -def ceil_op_impl(x: ibis_types.Value): - x_numeric = typing.cast(ibis_types.NumericValue, x) - if x_numeric.type().is_integer(): - return x_numeric.cast(ibis_dtypes.Float64()) - if x_numeric.type().is_floating(): - # Default ibis impl tries to cast to integer, which doesn't match pandas and can overflow - return float_ceil(x_numeric) - else: # numeric - return 
x_numeric.ceil() - - -@scalar_op_compiler.register_unary_op(ops.abs_op) -def abs_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.NumericValue, x).abs() - - -@scalar_op_compiler.register_unary_op(ops.pos_op) -def pos_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.NumericValue, x) - - -@scalar_op_compiler.register_unary_op(ops.neg_op) -def neg_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.NumericValue, x).negate() - - -@scalar_op_compiler.register_unary_op(ops.sqrt_op) -def sqrt_op_impl(x: ibis_types.Value): - numeric_value = typing.cast(ibis_types.NumericValue, x) - domain = numeric_value >= _ZERO - return (~domain).ifelse(_NAN, numeric_value.sqrt()) - - -@scalar_op_compiler.register_unary_op(ops.log10_op) -def log10_op_impl(x: ibis_types.Value): - numeric_value = typing.cast(ibis_types.NumericValue, x) - domain = numeric_value > _ZERO - out_of_domain = (numeric_value == _ZERO).ifelse(_NEG_INF, _NAN) - return (~domain).ifelse(out_of_domain, numeric_value.log10()) - - -@scalar_op_compiler.register_unary_op(ops.ln_op) -def ln_op_impl(x: ibis_types.Value): - numeric_value = typing.cast(ibis_types.NumericValue, x) - domain = numeric_value > _ZERO - out_of_domain = (numeric_value == _ZERO).ifelse(_NEG_INF, _NAN) - return (~domain).ifelse(out_of_domain, numeric_value.ln()) - - -@scalar_op_compiler.register_unary_op(ops.log1p_op) -def log1p_op_impl(x: ibis_types.Value): - return ln_op_impl(_ibis_num(1) + x) - - -@scalar_op_compiler.register_unary_op(ops.exp_op) -def exp_op_impl(x: ibis_types.Value): - numeric_value = typing.cast(ibis_types.NumericValue, x) - domain = numeric_value < _FLOAT64_EXP_BOUND - return (~domain).ifelse(_INF, numeric_value.exp()) - - -@scalar_op_compiler.register_unary_op(ops.expm1_op) -def expm1_op_impl(x: ibis_types.Value): - return exp_op_impl(x) - _ibis_num(1) - - -@scalar_op_compiler.register_unary_op(ops.invert_op) -def invert_op_impl(x: ibis_types.Value): - return x.__invert__() # type: ignore - - 
-## String Operation -@scalar_op_compiler.register_unary_op(ops.len_op) -def len_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).length().cast(ibis_dtypes.int64) - - -@scalar_op_compiler.register_unary_op(ops.reverse_op) -def reverse_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).reverse() - - -@scalar_op_compiler.register_unary_op(ops.lower_op) -def lower_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).lower() - - -@scalar_op_compiler.register_unary_op(ops.upper_op) -def upper_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).upper() - - -@scalar_op_compiler.register_unary_op(ops.StrLstripOp, pass_op=True) -def str_lstrip_op_impl(x: ibis_types.Value, op: ops.StrStripOp): - return str_lstrip_op(x, to_strip=op.to_strip) - - -@scalar_op_compiler.register_unary_op(ops.StrRstripOp, pass_op=True) -def str_rstrip_op_impl(x: ibis_types.Value, op: ops.StrRstripOp): - return str_rstrip_op(x, to_strip=op.to_strip) - - -@scalar_op_compiler.register_unary_op(ops.StrStripOp, pass_op=True) -def str_strip_op_impl(x: ibis_types.Value, op: ops.StrStripOp): - return str_strip_op(x, to_strip=op.to_strip) - - -@scalar_op_compiler.register_unary_op(ops.isnumeric_op) -def isnumeric_op_impl(x: ibis_types.Value): - # catches all members of the Unicode number class, which matches pandas isnumeric - # see https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#regexp_contains - # TODO: Validate correctness, my miss eg ⅕ character - return typing.cast(ibis_types.StringValue, x).re_search(r"^(\pN+)$") - - -@scalar_op_compiler.register_unary_op(ops.isalpha_op) -def isalpha_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).re_search( - r"^(\p{Lm}|\p{Lt}|\p{Lu}|\p{Ll}|\p{Lo})+$" - ) - - -@scalar_op_compiler.register_unary_op(ops.isdigit_op) -def isdigit_op_impl(x: ibis_types.Value): - # Based on docs, should include 
superscript/subscript-ed numbers - # Tests however pass only when set to Nd unicode class - return typing.cast(ibis_types.StringValue, x).re_search(r"^(\p{Nd})+$") - - -@scalar_op_compiler.register_unary_op(ops.isdecimal_op) -def isdecimal_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).re_search(r"^(\p{Nd})+$") - - -@scalar_op_compiler.register_unary_op(ops.isalnum_op) -def isalnum_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).re_search( - r"^(\p{N}|\p{Lm}|\p{Lt}|\p{Lu}|\p{Ll}|\p{Lo})+$" - ) - - -@scalar_op_compiler.register_unary_op(ops.isspace_op) -def isspace_op_impl(x: ibis_types.Value): - # All characters are whitespace characters, False for empty string - return typing.cast(ibis_types.StringValue, x).re_search(r"^\s+$") - - -@scalar_op_compiler.register_unary_op(ops.islower_op) -def islower_op_impl(x: ibis_types.Value): - # No upper case characters, min one cased character - # See: https://docs.python.org/3/library/stdtypes.html#str - return typing.cast(ibis_types.StringValue, x).re_search(r"\p{Ll}") & ~typing.cast( - ibis_types.StringValue, x - ).re_search(r"\p{Lu}|\p{Lt}") - - -@scalar_op_compiler.register_unary_op(ops.isupper_op) -def isupper_op_impl(x: ibis_types.Value): - # No lower case characters, min one cased character - # See: https://docs.python.org/3/library/stdtypes.html#str - return typing.cast(ibis_types.StringValue, x).re_search(r"\p{Lu}") & ~typing.cast( - ibis_types.StringValue, x - ).re_search(r"\p{Ll}|\p{Lt}") - - -@scalar_op_compiler.register_unary_op(ops.capitalize_op) -def capitalize_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).capitalize() - - -@scalar_op_compiler.register_unary_op(ops.StrContainsOp, pass_op=True) -def strcontains_op(x: ibis_types.Value, op: ops.StrContainsOp): - return typing.cast(ibis_types.StringValue, x).contains(op.pat) - - -@scalar_op_compiler.register_unary_op(ops.StrContainsRegexOp, pass_op=True) -def 
contains_regex_op_impl(x: ibis_types.Value, op: ops.StrContainsRegexOp): - return typing.cast(ibis_types.StringValue, x).re_search(op.pat) - - -@scalar_op_compiler.register_unary_op(ops.StrGetOp, pass_op=True) -def strget_op_impl(x: ibis_types.Value, op: ops.StrGetOp): - substr = typing.cast( - ibis_types.StringValue, typing.cast(ibis_types.StringValue, x)[op.i] - ) - return substr.nullif(ibis_types.literal("")) - - -@scalar_op_compiler.register_unary_op(ops.StrPadOp, pass_op=True) -def strpad_op_impl(x: ibis_types.Value, op: ops.StrPadOp): - str_val = typing.cast(ibis_types.StringValue, x) - - # SQL pad operations will truncate, we do not want to truncate though. - pad_length = typing.cast( - ibis_types.IntegerValue, ibis_api.greatest(str_val.length(), op.length) - ) - if op.side == "left": - return str_val.lpad(pad_length, op.fillchar) - elif op.side == "right": - return str_val.rpad(pad_length, op.fillchar) - else: # side == both - # Pad more on right side if can't pad both sides equally - two = typing.cast(ibis_types.IntegerValue, 2) - lpad_amount = ((pad_length - str_val.length()) // two) + str_val.length() - return str_val.lpad( - length=typing.cast(ibis_types.IntegerValue, lpad_amount), pad=op.fillchar - ).rpad(pad_length, op.fillchar) - - -@scalar_op_compiler.register_unary_op(ops.ReplaceStrOp, pass_op=True) -def replacestring_op_impl(x: ibis_types.Value, op: ops.ReplaceStrOp): - pat_str_value = typing.cast(ibis_types.StringValue, ibis_types.literal(op.pat)) - repl_str_value = typing.cast(ibis_types.StringValue, ibis_types.literal(op.repl)) - return typing.cast(ibis_types.StringValue, x).replace(pat_str_value, repl_str_value) - - -@scalar_op_compiler.register_unary_op(ops.RegexReplaceStrOp, pass_op=True) -def replaceregex_op_impl(x: ibis_types.Value, op: ops.RegexReplaceStrOp): - return typing.cast(ibis_types.StringValue, x).re_replace(op.pat, op.repl) - - -@scalar_op_compiler.register_unary_op(ops.StartsWithOp, pass_op=True) -def startswith_op_impl(x: 
ibis_types.Value, op: ops.StartsWithOp): - any_match = None - for pat in op.pat: - pat_match = typing.cast(ibis_types.StringValue, x).startswith(pat) - if any_match is not None: - any_match = any_match | pat_match - else: - any_match = pat_match - return any_match if any_match is not None else ibis_types.literal(False) - - -@scalar_op_compiler.register_unary_op(ops.EndsWithOp, pass_op=True) -def endswith_op_impl(x: ibis_types.Value, op: ops.EndsWithOp): - any_match = None - for pat in op.pat: - pat_match = typing.cast(ibis_types.StringValue, x).endswith(pat) - if any_match is not None: - any_match = any_match | pat_match - else: - any_match = pat_match - return any_match if any_match is not None else ibis_types.literal(False) - - -@scalar_op_compiler.register_unary_op(ops.StringSplitOp, pass_op=True) -def stringsplit_op_impl(x: ibis_types.Value, op: ops.StringSplitOp): - return typing.cast(ibis_types.StringValue, x).split(delimiter=op.pat) # type: ignore - - -@scalar_op_compiler.register_unary_op(ops.ZfillOp, pass_op=True) -def zfill_op_impl(x: ibis_types.Value, op: ops.ZfillOp): - str_value = typing.cast(ibis_types.StringValue, x) - return ( - ibis_api.case() - .when( - str_value[0] == "-", - "-" - + strpad_op_impl( - str_value.substr(1), - ops.StrPadOp(length=op.width - 1, fillchar="0", side="left"), - ), - ) - .else_( - strpad_op_impl( - str_value, ops.StrPadOp(length=op.width, fillchar="0", side="left") - ) - ) - .end() - ) - - -@scalar_op_compiler.register_unary_op(ops.StrFindOp, pass_op=True) -def find_op_impl(x: ibis_types.Value, op: ops.StrFindOp): - return typing.cast(ibis_types.StringValue, x).find(op.substr, op.start, op.end) - - -@scalar_op_compiler.register_unary_op(ops.StrExtractOp, pass_op=True) -def extract_op_impl(x: ibis_types.Value, op: ops.StrExtractOp): - return typing.cast(ibis_types.StringValue, x).re_extract(op.pat, op.n) - - -@scalar_op_compiler.register_unary_op(ops.StrSliceOp, pass_op=True) -def slice_op_impl(x: ibis_types.Value, op: 
ops.StrSliceOp): - return typing.cast(ibis_types.StringValue, x)[op.start : op.end] - - -@scalar_op_compiler.register_unary_op(ops.StrRepeatOp, pass_op=True) -def repeat_op_impl(x: ibis_types.Value, op: ops.StrRepeatOp): - return typing.cast(ibis_types.StringValue, x).repeat(op.repeats) - - -## Datetime Ops -@scalar_op_compiler.register_unary_op(ops.day_op) -def day_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).day().cast(ibis_dtypes.int64) - - -@scalar_op_compiler.register_unary_op(ops.date_op) -def date_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).date() - - -@scalar_op_compiler.register_unary_op(ops.iso_day_op) -def iso_day_op_impl(x: ibis_types.Value): - # Plus 1 because iso day of week uses 1-based indexing - return dayofweek_op_impl(x) + 1 - - -@scalar_op_compiler.register_unary_op(ops.iso_week_op) -def iso_week_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).week_of_year() - - -@scalar_op_compiler.register_unary_op(ops.iso_year_op) -def iso_year_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).iso_year() - - -@scalar_op_compiler.register_unary_op(ops.dayofweek_op) -def dayofweek_op_impl(x: ibis_types.Value): - return ( - typing.cast(ibis_types.TimestampValue, x) - .day_of_week.index() - .cast(ibis_dtypes.int64) - ) - - -@scalar_op_compiler.register_unary_op(ops.dayofyear_op) -def dayofyear_op_impl(x: ibis_types.Value): - return ( - typing.cast(ibis_types.TimestampValue, x).day_of_year().cast(ibis_dtypes.int64) - ) - - -@scalar_op_compiler.register_unary_op(ops.hour_op) -def hour_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).hour().cast(ibis_dtypes.int64) - - -@scalar_op_compiler.register_unary_op(ops.minute_op) -def minute_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).minute().cast(ibis_dtypes.int64) - - -@scalar_op_compiler.register_unary_op(ops.month_op) 
-def month_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).month().cast(ibis_dtypes.int64) - - -@scalar_op_compiler.register_unary_op(ops.quarter_op) -def quarter_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).quarter().cast(ibis_dtypes.int64) - - -@scalar_op_compiler.register_unary_op(ops.second_op) -def second_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).second().cast(ibis_dtypes.int64) - - -@scalar_op_compiler.register_unary_op(ops.StrftimeOp, pass_op=True) -def strftime_op_impl(x: ibis_types.Value, op: ops.StrftimeOp): - return ( - typing.cast(ibis_types.TimestampValue, x) - .strftime(op.date_format) - .cast(ibis_dtypes.str) - ) - - -@scalar_op_compiler.register_unary_op(ops.UnixSeconds) -def unix_seconds_op_impl(x: ibis_types.TimestampValue): - return x.epoch_seconds() - - -@scalar_op_compiler.register_unary_op(ops.UnixMicros) -def unix_micros_op_impl(x: ibis_types.TimestampValue): - return unix_micros(x) - - -@scalar_op_compiler.register_unary_op(ops.UnixMillis) -def unix_millis_op_impl(x: ibis_types.TimestampValue): - return unix_millis(x) - - -@scalar_op_compiler.register_binary_op(ops.timestamp_diff_op) -def timestamp_diff_op_impl(x: ibis_types.TimestampValue, y: ibis_types.TimestampValue): - return x.delta(y, "microsecond") - - -@scalar_op_compiler.register_binary_op(ops.timestamp_add_op) -def timestamp_add_op_impl(x: ibis_types.TimestampValue, y: ibis_types.IntegerValue): - return x + y.to_interval("us") - - -@scalar_op_compiler.register_binary_op(ops.timestamp_sub_op) -def timestamp_sub_op_impl(x: ibis_types.TimestampValue, y: ibis_types.IntegerValue): - return x - y.to_interval("us") - - -@scalar_op_compiler.register_binary_op(ops.date_diff_op) -def date_diff_op_impl(x: ibis_types.DateValue, y: ibis_types.DateValue): - return x.delta(y, "day") * int(UNIT_TO_US_CONVERSION_FACTORS["d"]) # type: ignore - - 
-@scalar_op_compiler.register_binary_op(ops.date_add_op) -def date_add_op_impl(x: ibis_types.DateValue, y: ibis_types.IntegerValue): - return x.cast(ibis_dtypes.timestamp()) + y.to_interval("us") # type: ignore - - -@scalar_op_compiler.register_binary_op(ops.date_sub_op) -def date_sub_op_impl(x: ibis_types.DateValue, y: ibis_types.IntegerValue): - return x.cast(ibis_dtypes.timestamp()) - y.to_interval("us") # type: ignore - - -@scalar_op_compiler.register_unary_op(ops.FloorDtOp, pass_op=True) -def floor_dt_op_impl(x: ibis_types.Value, op: ops.FloorDtOp): - supported_freqs = ["Y", "Q", "M", "W", "D", "h", "min", "s", "ms", "us", "ns"] - pandas_to_ibis_freqs = {"min": "m"} - if op.freq not in supported_freqs: - raise NotImplementedError( - f"Unsupported freq paramater: {op.freq}" - + " Supported freq parameters are: " - + ",".join(supported_freqs) - ) - if op.freq in pandas_to_ibis_freqs: - ibis_freq = pandas_to_ibis_freqs[op.freq] - else: - ibis_freq = op.freq - result_type = x.type() - result = typing.cast(ibis_types.TimestampValue, x) - result = result.truncate(ibis_freq) # type: ignore - return result.cast(result_type) - - -@scalar_op_compiler.register_binary_op(ops.DatetimeToIntegerLabelOp, pass_op=True) -def datetime_to_integer_label_op_impl( - x: ibis_types.Value, y: ibis_types.Value, op: ops.DatetimeToIntegerLabelOp -): - # Determine if the frequency is fixed by checking if 'op.freq.nanos' is defined. - try: - return datetime_to_integer_label_fixed_frequency(x, y, op) - except ValueError: - return datetime_to_integer_label_non_fixed_frequency(x, y, op) - - -def datetime_to_integer_label_fixed_frequency( - x: ibis_types.Value, y: ibis_types.Value, op: ops.DatetimeToIntegerLabelOp -): - """ - This function handles fixed frequency conversions where the unit can range - from microseconds (us) to days. 
- """ - us = op.freq.nanos / 1000 - x_int = x.cast(ibis_dtypes.Timestamp(timezone="UTC")).cast(ibis_dtypes.int64) - first = calculate_resample_first(y, op.origin) - x_int_label = (x_int - first) // us - return x_int_label - - -def datetime_to_integer_label_non_fixed_frequency( - x: ibis_types.Value, y: ibis_types.Value, op: ops.DatetimeToIntegerLabelOp -): - """ - This function handles non-fixed frequency conversions for units ranging - from weeks to years. - """ - rule_code = op.freq.rule_code - n = op.freq.n - if rule_code == "W-SUN": # Weekly - us = n * 7 * 24 * 60 * 60 * 1000000 - x = x.truncate("week") + ibis_api.interval(days=6) # type: ignore - y = y.truncate("week") + ibis_api.interval(days=6) # type: ignore - x_int = x.cast(ibis_dtypes.Timestamp(timezone="UTC")).cast(ibis_dtypes.int64) - first = y.cast(ibis_dtypes.Timestamp(timezone="UTC")).cast(ibis_dtypes.int64) - x_int_label = ( - ibis_api.case() - .when(x_int == first, 0) - .else_((x_int - first - 1) // us + 1) # type: ignore - .end() - ) - elif rule_code == "ME": # Monthly - x_int = x.year() * 12 + x.month() - 1 # type: ignore - first = y.year() * 12 + y.month() - 1 # type: ignore - x_int_label = ( - ibis_api.case() - .when(x_int == first, 0) - .else_((x_int - first - 1) // n + 1) # type: ignore - .end() - ) - elif rule_code == "QE-DEC": # Quarterly - x_int = x.year() * 4 + x.quarter() - 1 # type: ignore - first = y.year() * 4 + y.quarter() - 1 # type: ignore - x_int_label = ( - ibis_api.case() - .when(x_int == first, 0) - .else_((x_int - first - 1) // n + 1) # type: ignore - .end() - ) - elif rule_code == "YE-DEC": # Yearly - x_int = x.year() # type: ignore - first = y.year() # type: ignore - x_int_label = ( - ibis_api.case() - .when(x_int == first, 0) - .else_((x_int - first - 1) // n + 1) # type: ignore - .end() - ) - else: - raise ValueError(rule_code) - return x_int_label - - -@scalar_op_compiler.register_binary_op(ops.IntegerLabelToDatetimeOp, pass_op=True) -def 
integer_label_to_datetime_op_impl( - x: ibis_types.Value, y: ibis_types.Value, op: ops.IntegerLabelToDatetimeOp -): - # Determine if the frequency is fixed by checking if 'op.freq.nanos' is defined. - try: - return integer_label_to_datetime_op_fixed_frequency(x, y, op) - except ValueError: - return integer_label_to_datetime_op_non_fixed_frequency(x, y, op) - - -def integer_label_to_datetime_op_fixed_frequency( - x: ibis_types.Value, y: ibis_types.Value, op: ops.IntegerLabelToDatetimeOp -): - """ - This function handles fixed frequency conversions where the unit can range - from microseconds (us) to days. - """ - us = op.freq.nanos / 1000 - - first = calculate_resample_first(y, op.origin) - - x_label = ( - (x * us + first) # type: ignore - .cast(ibis_dtypes.int64) - .to_timestamp(unit="us") - .cast(ibis_dtypes.Timestamp(timezone="UTC")) - .cast(y.type()) - ) - return x_label - - -def integer_label_to_datetime_op_non_fixed_frequency( - x: ibis_types.Value, y: ibis_types.Value, op: ops.IntegerLabelToDatetimeOp -): - """ - This function handles non-fixed frequency conversions for units ranging - from weeks to years. 
- """ - rule_code = op.freq.rule_code - n = op.freq.n - if rule_code == "W-SUN": # Weekly - us = n * 7 * 24 * 60 * 60 * 1000000 - first = ( - y.cast(ibis_dtypes.Timestamp(timezone="UTC")).truncate("week") # type: ignore - + ibis_api.interval(days=6) - ).cast(ibis_dtypes.int64) - x_label = ( - (x * us + first) # type: ignore - .cast(ibis_dtypes.int64) - .to_timestamp(unit="us") - .cast(ibis_dtypes.Timestamp(timezone="UTC")) - .cast(y.type()) - ) - elif rule_code == "ME": # Monthly - one = ibis_types.literal(1) - twelve = ibis_types.literal(12) - first = y.year() * twelve + y.month() - one # type: ignore - - x = x * n + first # type: ignore - year = x // twelve # type: ignore - month = (x % twelve) + one # type: ignore - - next_year = (month == twelve).ifelse(year + one, year) - next_month = (month == twelve).ifelse(one, month + one) - next_month_date = ibis_api.timestamp( - typing.cast(ibis_types.IntegerValue, next_year), - typing.cast(ibis_types.IntegerValue, next_month), - 1, - 0, - 0, - 0, - ) - x_label = next_month_date - ibis_api.interval(days=1) - elif rule_code == "QE-DEC": # Quarterly - one = ibis_types.literal(1) - three = ibis_types.literal(3) - four = ibis_types.literal(4) - twelve = ibis_types.literal(12) - first = y.year() * four + y.quarter() - one # type: ignore - - x = x * n + first # type: ignore - year = x // four # type: ignore - month = ((x % four) + one) * three # type: ignore - - next_year = (month == twelve).ifelse(year + one, year) - next_month = (month == twelve).ifelse(one, month + one) - next_month_date = ibis_api.timestamp( - typing.cast(ibis_types.IntegerValue, next_year), - typing.cast(ibis_types.IntegerValue, next_month), - 1, - 0, - 0, - 0, - ) - - x_label = next_month_date - ibis_api.interval(days=1) - elif rule_code == "YE-DEC": # Yearly - one = ibis_types.literal(1) - first = y.year() # type: ignore - x = x * n + first # type: ignore - next_year = x + one # type: ignore - next_month_date = ibis_api.timestamp( - 
typing.cast(ibis_types.IntegerValue, next_year), - 1, - 1, - 0, - 0, - 0, - ) - x_label = next_month_date - ibis_api.interval(days=1) - - return x_label.cast(ibis_dtypes.Timestamp(timezone="UTC")).cast(y.type()) - - -def calculate_resample_first(y: ibis_types.Value, origin): - if origin == "epoch": - return ibis_types.literal(0) - elif origin == "start_day": - return ( - y.cast(ibis_dtypes.date) - .cast(ibis_dtypes.Timestamp(timezone="UTC")) - .cast(ibis_dtypes.int64) - ) - elif origin == "start": - return y.cast(ibis_dtypes.Timestamp(timezone="UTC")).cast(ibis_dtypes.int64) - else: - raise ValueError(f"Origin {origin} not supported") - - -@scalar_op_compiler.register_unary_op(ops.time_op) -def time_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).time() - - -@scalar_op_compiler.register_unary_op(ops.year_op) -def year_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).year().cast(ibis_dtypes.int64) - - -@scalar_op_compiler.register_unary_op(ops.normalize_op) -def normalize_op_impl(x: ibis_types.Value): - result_type = x.type() - result = x.truncate("D") # type: ignore - return result.cast(result_type) - - -# Geo Ops -@scalar_op_compiler.register_unary_op(ops.geo_area_op) -def geo_area_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.GeoSpatialValue, x).area() - - -@scalar_op_compiler.register_unary_op(ops.geo_st_astext_op) -def geo_st_astext_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.GeoSpatialValue, x).as_text() - - -@scalar_op_compiler.register_unary_op(ops.geo_st_boundary_op, pass_op=False) -def geo_st_boundary_op_impl(x: ibis_types.Value): - return st_boundary(x) - - -@scalar_op_compiler.register_binary_op(ops.geo_st_difference_op, pass_op=False) -def geo_st_difference_op_impl(x: ibis_types.Value, y: ibis_types.Value): - return typing.cast(ibis_types.GeoSpatialValue, x).difference( - typing.cast(ibis_types.GeoSpatialValue, y) - ) - - 
-@scalar_op_compiler.register_binary_op(ops.GeoStDistanceOp, pass_op=True) -def geo_st_distance_op_impl( - x: ibis_types.Value, y: ibis_types.Value, op: ops.GeoStDistanceOp -): - return st_distance(x, y, op.use_spheroid) - - -@scalar_op_compiler.register_unary_op(ops.geo_st_geogfromtext_op) -def geo_st_geogfromtext_op_impl(x: ibis_types.Value): - # Ibis doesn't seem to provide a dedicated method to cast from string to geography, - # so we use a BigQuery scalar function, st_geogfromtext(), directly. - return st_geogfromtext(x) - - -@scalar_op_compiler.register_binary_op(ops.geo_st_geogpoint_op, pass_op=False) -def geo_st_geogpoint_op_impl(x: ibis_types.Value, y: ibis_types.Value): - return typing.cast(ibis_types.NumericValue, x).point( - typing.cast(ibis_types.NumericValue, y) - ) - - -@scalar_op_compiler.register_binary_op(ops.geo_st_intersection_op, pass_op=False) -def geo_st_intersection_op_impl(x: ibis_types.Value, y: ibis_types.Value): - return typing.cast(ibis_types.GeoSpatialValue, x).intersection( - typing.cast(ibis_types.GeoSpatialValue, y) - ) - - -@scalar_op_compiler.register_unary_op(ops.geo_st_isclosed_op, pass_op=False) -def geo_st_isclosed_op_impl(x: ibis_types.Value): - return st_isclosed(x) - - -@scalar_op_compiler.register_unary_op(ops.geo_x_op) -def geo_x_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.GeoSpatialValue, x).x() - - -@scalar_op_compiler.register_unary_op(ops.GeoStLengthOp, pass_op=True) -def geo_length_op_impl(x: ibis_types.Value, op: ops.GeoStLengthOp): - # Call the st_length UDF defined in this file (or imported) - return st_length(x, op.use_spheroid) - - -@scalar_op_compiler.register_unary_op(ops.geo_y_op) -def geo_y_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.GeoSpatialValue, x).y() - - -# Parameterized ops -@scalar_op_compiler.register_unary_op(ops.StructFieldOp, pass_op=True) -def struct_field_op_impl(x: ibis_types.Value, op: ops.StructFieldOp): - struct_value = 
typing.cast(ibis_types.StructValue, x) - if isinstance(op.name_or_index, str): - name = op.name_or_index - else: - name = struct_value.names[op.name_or_index] - - result = struct_value[name] - return result.cast(result.type()(nullable=True)).name(name) - - -def numeric_to_datetime( - x: ibis_types.Value, unit: str, safe: bool = False -) -> ibis_types.TimestampValue: - if not isinstance(x, ibis_types.IntegerValue) and not isinstance( - x, ibis_types.FloatingValue - ): - raise TypeError("Non-numerical types are not supposed to reach this function.") - - if unit not in UNIT_TO_US_CONVERSION_FACTORS: - raise ValueError(f"Cannot convert input with unit '{unit}'.") - x_converted = x * typing.cast( - ibis_types.IntegerValue, UNIT_TO_US_CONVERSION_FACTORS[unit] - ) - x_converted = ( - x_converted.try_cast(ibis_dtypes.int64) # type: ignore - if safe - else x_converted.cast(ibis_dtypes.int64) - ) - - # Note: Due to an issue where casting directly to a timestamp - # without a timezone does not work, we first cast to UTC. This - # approach appears to bypass a potential bug in Ibis's cast function, - # allowing for subsequent casting to a timestamp type without timezone - # information. Further investigation is needed to confirm this behavior. - return x_converted.to_timestamp(unit="us").cast( # type: ignore - ibis_dtypes.Timestamp(timezone="UTC") - ) - - -@scalar_op_compiler.register_unary_op(ops.AsTypeOp, pass_op=True) -def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp): - to_type = bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype( - op.to_type - ) - if isinstance(x, ibis_types.NullScalar): - return ibis_types.null().cast(to_type) - - # When casting DATETIME column into INT column, we need to convert the column into TIMESTAMP first. 
- if to_type == ibis_dtypes.int64 and x.type() == ibis_dtypes.timestamp: - utc_time_type = ibis_dtypes.Timestamp(timezone="UTC") - x_converted = x.try_cast(utc_time_type) if op.safe else x.cast(utc_time_type) - return bigframes.core.compile.ibis_types.cast_ibis_value( - x_converted, to_type, safe=op.safe - ) - - if to_type == ibis_dtypes.int64 and x.type() == ibis_dtypes.time: - # The conversion unit is set to "us" (microseconds) for consistency - # with pandas converting time64[us][pyarrow] to int64[pyarrow]. - return x.delta(ibis_api.time("00:00:00"), part="microsecond") # type: ignore - - if x.type() == ibis_dtypes.int64: - # The conversion unit is set to "us" (microseconds) for consistency - # with pandas converting int64[pyarrow] to timestamp[us][pyarrow], - # timestamp[us, tz=UTC][pyarrow], and time64[us][pyarrow]. - unit = "us" - x_converted = numeric_to_datetime(x, unit, safe=op.safe) - if to_type == ibis_dtypes.timestamp: - return ( - x_converted.try_cast(ibis_dtypes.Timestamp()) - if op.safe - else x_converted.cast(ibis_dtypes.Timestamp()) - ) - elif to_type == ibis_dtypes.Timestamp(timezone="UTC"): - return x_converted - elif to_type == ibis_dtypes.time: - return x_converted.time() - - if to_type == ibis_dtypes.json: - if x.type() == ibis_dtypes.string: - return parse_json_in_safe(x) if op.safe else parse_json(x) - if x.type() == ibis_dtypes.bool: - x_bool = typing.cast( - ibis_types.StringValue, - bigframes.core.compile.ibis_types.cast_ibis_value( - x, ibis_dtypes.string, safe=op.safe - ), - ).lower() - return parse_json_in_safe(x_bool) if op.safe else parse_json(x_bool) - if x.type() in (ibis_dtypes.int64, ibis_dtypes.float64): - x_str = bigframes.core.compile.ibis_types.cast_ibis_value( - x, ibis_dtypes.string, safe=op.safe - ) - return parse_json_in_safe(x_str) if op.safe else parse_json(x_str) - - if x.type() == ibis_dtypes.json: - if to_type == ibis_dtypes.int64: - return cast_json_to_int64_in_safe(x) if op.safe else cast_json_to_int64(x) - if 
to_type == ibis_dtypes.float64: - return ( - cast_json_to_float64_in_safe(x) if op.safe else cast_json_to_float64(x) - ) - if to_type == ibis_dtypes.bool: - return cast_json_to_bool_in_safe(x) if op.safe else cast_json_to_bool(x) - if to_type == ibis_dtypes.string: - return cast_json_to_string_in_safe(x) if op.safe else cast_json_to_string(x) - - # TODO: either inline this function, or push rest of this op into the function - return bigframes.core.compile.ibis_types.cast_ibis_value(x, to_type, safe=op.safe) - - -@scalar_op_compiler.register_unary_op(ops.IsInOp, pass_op=True) -def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp): - contains_nulls = any(is_null(value) for value in op.values) - matchable_ibis_values = [] - for item in op.values: - if not is_null(item): - try: - # we want values that *could* be cast to the dtype, but we don't want - # to actually cast it, as that could be lossy (eg float -> int) - item_inferred_type = ibis_types.literal(item).type() - if ( - x.type() == item_inferred_type - or x.type().is_numeric() - and item_inferred_type.is_numeric() - ): - matchable_ibis_values.append(item) - except TypeError: - pass - - if op.match_nulls and contains_nulls: - return x.isnull() | x.isin(matchable_ibis_values) - else: - return x.isin(matchable_ibis_values) - - -@scalar_op_compiler.register_unary_op(ops.ToDatetimeOp, pass_op=True) -def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): - if x.type() == ibis_dtypes.str: - return x.try_cast(ibis_dtypes.Timestamp(None)) # type: ignore - else: - # Numerical inputs. - if op.format: - x = x.cast(ibis_dtypes.str).to_timestamp(op.format) # type: ignore - else: - # The default unit is set to "ns" (nanoseconds) for consistency - # with pandas, where "ns" is the default unit for datetime operations. 
- unit = op.unit or "ns" - x = numeric_to_datetime(x, unit) - - return x.cast(ibis_dtypes.Timestamp(None)) # type: ignore - - -@scalar_op_compiler.register_unary_op(ops.ToTimestampOp, pass_op=True) -def to_timestamp_op_impl(x: ibis_types.Value, op: ops.ToTimestampOp): - if x.type() == ibis_dtypes.str: - x = ( - typing.cast(ibis_types.StringValue, x).to_timestamp(op.format) - if op.format - else timestamp(x) - ) - else: - # Numerical inputs. - if op.format: - x = x.cast(ibis_dtypes.str).to_timestamp(op.format) # type: ignore - else: - # The default unit is set to "ns" (nanoseconds) for consistency - # with pandas, where "ns" is the default unit for datetime operations. - unit = op.unit or "ns" - x = numeric_to_datetime(x, unit) - - return x.cast(ibis_dtypes.Timestamp(timezone="UTC")) - - -@scalar_op_compiler.register_unary_op(ops.ToTimedeltaOp, pass_op=True) -def to_timedelta_op_impl(x: ibis_types.Value, op: ops.ToTimedeltaOp): - return ( - typing.cast(ibis_types.NumericValue, x) * UNIT_TO_US_CONVERSION_FACTORS[op.unit] # type: ignore - ).floor() - - -@scalar_op_compiler.register_unary_op(ops.timedelta_floor_op) -def timedelta_floor_op_impl(x: ibis_types.NumericValue): - return x.floor() - - -@scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True) -def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp): - udf_sig = op.function_def.signature - ibis_py_sig = (udf_sig.py_input_types, udf_sig.py_output_type) - - @ibis_udf.scalar.builtin( - name=str(op.function_def.routine_ref), signature=ibis_py_sig - ) - def udf(input): - ... 
- - x_transformed = udf(x) - if not op.apply_on_null: - return ibis_api.case().when(x.isnull(), x).else_(x_transformed).end() - return x_transformed - - -@scalar_op_compiler.register_binary_op(ops.BinaryRemoteFunctionOp, pass_op=True) -def binary_remote_function_op_impl( - x: ibis_types.Value, y: ibis_types.Value, op: ops.BinaryRemoteFunctionOp -): - udf_sig = op.function_def.signature - ibis_py_sig = (udf_sig.py_input_types, udf_sig.py_output_type) - - @ibis_udf.scalar.builtin( - name=str(op.function_def.routine_ref), signature=ibis_py_sig - ) - def udf(input1, input2): - ... - - x_transformed = udf(x, y) - return x_transformed - - -@scalar_op_compiler.register_nary_op(ops.NaryRemoteFunctionOp, pass_op=True) -def nary_remote_function_op_impl( - *operands: ibis_types.Value, op: ops.NaryRemoteFunctionOp -): - udf_sig = op.function_def.signature - ibis_py_sig = (udf_sig.py_input_types, udf_sig.py_output_type) - arg_names = tuple(arg.name for arg in udf_sig.input_types) - - @ibis_udf.scalar.builtin( - name=str(op.function_def.routine_ref), - signature=ibis_py_sig, - param_name_overrides=arg_names, - ) - def udf(*inputs): - ... 
- - result = udf(*operands) - return result - - -@scalar_op_compiler.register_unary_op(ops.MapOp, pass_op=True) -def map_op_impl(x: ibis_types.Value, op: ops.MapOp): - case = ibis_api.case() - for mapping in op.mappings: - case = case.when(x == mapping[0], mapping[1]) - return case.else_(x).end() - - -# Array Ops -@scalar_op_compiler.register_unary_op(ops.ArrayToStringOp, pass_op=True) -def array_to_string_op_impl(x: ibis_types.Value, op: ops.ArrayToStringOp): - return typing.cast(ibis_types.ArrayValue, x).join(op.delimiter) - - -@scalar_op_compiler.register_unary_op(ops.ArrayIndexOp, pass_op=True) -def array_index_op_impl(x: ibis_types.Value, op: ops.ArrayIndexOp): - res = typing.cast(ibis_types.ArrayValue, x)[op.index] - if x.type().is_string(): - return _null_or_value(res, res != ibis_types.literal("")) - else: - return res - - -@scalar_op_compiler.register_unary_op(ops.ArraySliceOp, pass_op=True) -def array_slice_op_impl(x: ibis_types.Value, op: ops.ArraySliceOp): - res = typing.cast(ibis_types.ArrayValue, x)[op.start : op.stop : op.step] - if x.type().is_string(): - return _null_or_value(res, res != ibis_types.literal("")) - else: - return res - - -# JSON Ops -@scalar_op_compiler.register_binary_op(ops.JSONSet, pass_op=True) -def json_set_op_impl(x: ibis_types.Value, y: ibis_types.Value, op: ops.JSONSet): - return json_set(json_obj=x, json_path=op.json_path, json_value=y) - - -@scalar_op_compiler.register_unary_op(ops.JSONExtract, pass_op=True) -def json_extract_op_impl(x: ibis_types.Value, op: ops.JSONExtract): - # Define a user-defined function whose returned type is dynamically matching the input. - def json_extract(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore - """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value.""" - ... 
- - return_type = x.type() - json_extract.__annotations__["return"] = return_type - json_extract_op = ibis_udf.scalar.builtin(json_extract) - return json_extract_op(json_or_json_string=x, json_path=op.json_path) - - -@scalar_op_compiler.register_unary_op(ops.JSONExtractArray, pass_op=True) -def json_extract_array_op_impl(x: ibis_types.Value, op: ops.JSONExtractArray): - # Define a user-defined function whose returned type is dynamically matching the input. - def json_extract_array(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore - """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value.""" - ... - - return_type = x.type() - json_extract_array.__annotations__["return"] = ibis_dtypes.Array[return_type] # type: ignore - json_extract_op = ibis_udf.scalar.builtin(json_extract_array) - return json_extract_op(json_or_json_string=x, json_path=op.json_path) - - -@scalar_op_compiler.register_unary_op(ops.JSONExtractStringArray, pass_op=True) -def json_extract_string_array_op_impl( - x: ibis_types.Value, op: ops.JSONExtractStringArray -): - return json_extract_string_array(json_obj=x, json_path=op.json_path) - - -@scalar_op_compiler.register_unary_op(ops.JSONQuery, pass_op=True) -def json_query_op_impl(x: ibis_types.Value, op: ops.JSONQuery): - # Define a user-defined function whose returned type is dynamically matching the input. - def json_query(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore - """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value.""" - ... 
- - return_type = x.type() - json_query.__annotations__["return"] = return_type - json_query_op = ibis_udf.scalar.builtin(json_query) - return json_query_op(json_or_json_string=x, json_path=op.json_path) - - -@scalar_op_compiler.register_unary_op(ops.JSONQueryArray, pass_op=True) -def json_query_array_op_impl(x: ibis_types.Value, op: ops.JSONQueryArray): - # Define a user-defined function whose returned type is dynamically matching the input. - def json_query_array(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore - """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value.""" - ... - - return_type = x.type() - json_query_array.__annotations__["return"] = ibis_dtypes.Array[return_type] # type: ignore - json_query_op = ibis_udf.scalar.builtin(json_query_array) - return json_query_op(json_or_json_string=x, json_path=op.json_path) - - -@scalar_op_compiler.register_unary_op(ops.ParseJSON, pass_op=True) -def parse_json_op_impl(x: ibis_types.Value, op: ops.ParseJSON): - return parse_json(json_str=x) - - -@scalar_op_compiler.register_unary_op(ops.ToJSONString) -def to_json_string_op_impl(json_obj: ibis_types.Value): - return to_json_string(json_obj=json_obj) - - -@scalar_op_compiler.register_unary_op(ops.JSONValue, pass_op=True) -def json_value_op_impl(x: ibis_types.Value, op: ops.JSONValue): - return json_value(json_obj=x, json_path=op.json_path) - - -@scalar_op_compiler.register_unary_op(ops.JSONValueArray, pass_op=True) -def json_value_array_op_impl(x: ibis_types.Value, op: ops.JSONValueArray): - return json_value_array(json_obj=x, json_path=op.json_path) - - -# Blob Ops -@scalar_op_compiler.register_unary_op(ops.obj_fetch_metadata_op) -def obj_fetch_metadata_op_impl(obj_ref: ibis_types.Value): - return obj_fetch_metadata(obj_ref=obj_ref) - - -@scalar_op_compiler.register_unary_op(ops.ObjGetAccessUrl, pass_op=True) -def obj_get_access_url_op_impl(obj_ref: ibis_types.Value, op: ops.ObjGetAccessUrl): - return 
obj_get_access_url(obj_ref=obj_ref, mode=op.mode) - - -### Binary Ops -def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None): - """Wraps a binary operator to generate nulls of the expected type if either input is a null scalar.""" - - def short_circuit_nulls_inner(binop): - @functools.wraps(binop) - def wrapped_binop(x: ibis_types.Value, y: ibis_types.Value): - if isinstance(x, ibis_types.NullScalar): - return ibis_types.null().cast(type_override or y.type()) - elif isinstance(y, ibis_types.NullScalar): - return ibis_types.null().cast(type_override or x.type()) - else: - return binop(x, y) - - return wrapped_binop - - return short_circuit_nulls_inner - - -@scalar_op_compiler.register_binary_op(ops.strconcat_op) -def concat_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - x_string = typing.cast(ibis_types.StringValue, x) - y_string = typing.cast(ibis_types.StringValue, y) - return x_string.concat(y_string) - - -@scalar_op_compiler.register_binary_op(ops.eq_op) -def eq_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - x, y = _coerce_comparables(x, y) - return x == y - - -@scalar_op_compiler.register_binary_op(ops.eq_null_match_op) -def eq_nulls_match_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - """Variant of eq_op where nulls match each other. 
Only use where dtypes are known to be same.""" - x, y = _coerce_comparables(x, y) - literal = ibis_types.literal("$NULL_SENTINEL$") - if hasattr(x, "fill_null"): - left = x.cast(ibis_dtypes.str).fill_null(literal) - right = y.cast(ibis_dtypes.str).fill_null(literal) - else: - left = x.cast(ibis_dtypes.str).fillna(literal) - right = y.cast(ibis_dtypes.str).fillna(literal) - - return left == right - - -@scalar_op_compiler.register_binary_op(ops.ne_op) -def ne_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - x, y = _coerce_comparables(x, y) - return x != y - - -def _null_or_value(value: ibis_types.Value, where_value: ibis_types.BooleanValue): - return ibis_api.ifelse( - where_value, - value, - ibis_types.null(), - ) - - -def _coerce_comparables( - x: ibis_types.Value, - y: ibis_types.Value, -): - if x.type().is_boolean() and not y.type().is_boolean(): - x = x.cast(ibis_dtypes.int64) - elif y.type().is_boolean() and not x.type().is_boolean(): - y = y.cast(ibis_dtypes.int64) - return x, y - - -@scalar_op_compiler.register_binary_op(ops.and_op) -def and_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - # Workaround issue https://github.com/ibis-project/ibis/issues/7775 by - # implementing three-valued logic ourselves. For AND, when we encounter a - # NULL value, we only know when the result is FALSE, otherwise the result - # is unknown (NULL). 
See: truth table at - # https://en.wikibooks.org/wiki/Structured_Query_Language/NULLs_and_the_Three_Valued_Logic#AND,_OR - if isinstance(x, ibis_types.NullScalar): - return _null_or_value(y, y == ibis_types.literal(False)) - - if isinstance(y, ibis_types.NullScalar): - return _null_or_value(x, x == ibis_types.literal(False)) - return typing.cast(ibis_types.BooleanValue, x) & typing.cast( - ibis_types.BooleanValue, y - ) - - -@scalar_op_compiler.register_binary_op(ops.or_op) -def or_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - # Workaround issue https://github.com/ibis-project/ibis/issues/7775 by - # implementing three-valued logic ourselves. For OR, when we encounter a - # NULL value, we only know when the result is TRUE, otherwise the result - # is unknown (NULL). See: truth table at - # https://en.wikibooks.org/wiki/Structured_Query_Language/NULLs_and_the_Three_Valued_Logic#AND,_OR - if isinstance(x, ibis_types.NullScalar): - return _null_or_value(y, y == ibis_types.literal(True)) - - if isinstance(y, ibis_types.NullScalar): - return _null_or_value(x, x == ibis_types.literal(True)) - return typing.cast(ibis_types.BooleanValue, x) | typing.cast( - ibis_types.BooleanValue, y - ) - - -@scalar_op_compiler.register_binary_op(ops.xor_op) -def xor_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - return typing.cast(ibis_types.BooleanValue, x) ^ typing.cast( - ibis_types.BooleanValue, y - ) - - -@scalar_op_compiler.register_binary_op(ops.add_op) -@short_circuit_nulls() -def add_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - if isinstance(x, ibis_types.NullScalar) or isinstance(x, ibis_types.NullScalar): - return ibis_types.null() - return x + y # type: ignore - - -@scalar_op_compiler.register_binary_op(ops.sub_op) -@short_circuit_nulls() -def sub_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - return typing.cast(ibis_types.NumericValue, x) - typing.cast( - ibis_types.NumericValue, y - ) - - 
-@scalar_op_compiler.register_binary_op(ops.mul_op) -@short_circuit_nulls() -def mul_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - return typing.cast(ibis_types.NumericValue, x) * typing.cast( - ibis_types.NumericValue, y - ) - - -@scalar_op_compiler.register_binary_op(ops.div_op) -@short_circuit_nulls(ibis_dtypes.float) -def div_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - return typing.cast(ibis_types.NumericValue, x) / typing.cast( - ibis_types.NumericValue, y - ) - - -@scalar_op_compiler.register_binary_op(ops.pow_op) -@short_circuit_nulls(ibis_dtypes.float) -def pow_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - if x.type().is_integer() and y.type().is_integer(): - return _int_pow_op(x, y) - else: - return _float_pow_op(x, y) - - -@scalar_op_compiler.register_binary_op(ops.unsafe_pow_op) -@short_circuit_nulls(ibis_dtypes.float) -def unsafe_pow_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - """For internal use only - where domain and overflow checks are not needed.""" - return typing.cast(ibis_types.NumericValue, x) ** typing.cast( - ibis_types.NumericValue, y - ) - - -def _int_pow_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - # Need to avoid any error cases - should produce NaN instead - # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#pow - x_as_decimal = typing.cast( - ibis_types.NumericValue, - x.cast(ibis_dtypes.Decimal(precision=38, scale=9, nullable=True)), - ) - y_val = typing.cast(ibis_types.NumericValue, y) - - # BQ POW() function outputs FLOAT64, which can lose precision. - # Therefore, we do math in NUMERIC and cast back down after. - # Also, explicit bounds checks, pandas will silently overflow. 
- pow_result = x_as_decimal**y_val - overflow_cond = (pow_result > _ibis_num((2**63) - 1)) | ( - pow_result < _ibis_num(-(2**63)) - ) - - return ( - ibis_api.case() - .when((overflow_cond), ibis_types.null()) - .else_(pow_result.cast(ibis_dtypes.int64)) - .end() - ) - - -def _float_pow_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - # Most conditions here seek to prevent calling BQ POW with inputs that would generate errors. - # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#pow - x_val = typing.cast(ibis_types.NumericValue, x) - y_val = typing.cast(ibis_types.NumericValue, y) - - overflow_cond = (x_val != _ZERO) & ((y_val * x_val.abs().ln()) > _FLOAT64_EXP_BOUND) - - # Float64 lose integer precision beyond 2**53, beyond this insufficient precision to get parity - exp_too_big = y_val.abs() > _ibis_num(2**53) - # Treat very large exponents as +=INF - norm_exp = exp_too_big.ifelse(_INF * y_val.sign(), y_val) - - pow_result = x_val**norm_exp - - # This cast is dangerous, need to only excuted where y_val has been bounds-checked - # Ibis needs try_cast binding to bq safe_cast - exponent_is_whole = y_val.cast(ibis_dtypes.int64) == y_val - odd_exponent = (x_val < _ZERO) & ( - y_val.cast(ibis_dtypes.int64) % _ibis_num(2) == _ibis_num(1) - ) - infinite_base = x_val.abs() == _INF - - return ( - ibis_api.case() - # Might be able to do something more clever with x_val==0 case - .when(y_val == _ZERO, _ibis_num(1)) - .when( - x_val == _ibis_num(1), _ibis_num(1) - ) # Need to ignore exponent, even if it is NA - .when( - (x_val == _ZERO) & (y_val < _ZERO), _INF - ) # This case would error POW function in BQ - .when(infinite_base, pow_result) - .when( - exp_too_big, pow_result - ) # Bigquery can actually handle the +-inf cases gracefully - .when((x_val < _ZERO) & (~exponent_is_whole), _NAN) - .when( - overflow_cond, _INF * odd_exponent.ifelse(_ibis_num(-1), _ibis_num(1)) - ) # finite overflows would cause bq to error - 
.else_(pow_result) - .end() - ) - - -@scalar_op_compiler.register_binary_op(ops.lt_op) -@short_circuit_nulls(ibis_dtypes.bool) -def lt_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - x, y = _coerce_comparables(x, y) - return x < y - - -@scalar_op_compiler.register_binary_op(ops.le_op) -@short_circuit_nulls(ibis_dtypes.bool) -def le_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - x, y = _coerce_comparables(x, y) - return x <= y - - -@scalar_op_compiler.register_binary_op(ops.gt_op) -@short_circuit_nulls(ibis_dtypes.bool) -def gt_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - x, y = _coerce_comparables(x, y) - return x > y - - -@scalar_op_compiler.register_binary_op(ops.ge_op) -@short_circuit_nulls(ibis_dtypes.bool) -def ge_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - x, y = _coerce_comparables(x, y) - return x >= y - - -@scalar_op_compiler.register_binary_op(ops.floordiv_op) -@short_circuit_nulls(ibis_dtypes.int) -def floordiv_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - x_numeric = typing.cast(ibis_types.NumericValue, x) - y_numeric = typing.cast(ibis_types.NumericValue, y) - floordiv_expr = x_numeric // y_numeric - - # DIV(N, 0) will error in bigquery, but needs to return 0 for int, and inf for float in BQ so we short-circuit in this case. - # Multiplying left by zero propogates nulls. 
- zero_result = _INF if (x.type().is_floating() or y.type().is_floating()) else _ZERO - return ( - ibis_api.case() - .when(y_numeric == _ZERO, zero_result * x_numeric) - .else_(floordiv_expr) - .end() - ) - - -def _is_bignumeric(x: ibis_types.Value): - if not isinstance(x, ibis_types.DecimalValue): - return False - # Should be exactly 76 for bignumeric - return x.precision > 70 # type: ignore - - -def _is_numeric(x: ibis_types.Value): - # either big-numeric or numeric - return isinstance(x, ibis_types.DecimalValue) - - -@scalar_op_compiler.register_binary_op(ops.mod_op) -@short_circuit_nulls() -def mod_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - # Hacky short-circuit to avoid passing zero-literal to sql backend, evaluate locally instead to null. - op = y.op() - if isinstance(op, ibis_generic.Literal) and op.value == 0: - return ibis_types.null().cast(x.type()) - - if x.type().is_integer() and y.type().is_integer(): - # both are ints, no casting necessary - return _int_mod( - typing.cast(ibis_types.IntegerValue, x), - typing.cast(ibis_types.IntegerValue, y), - ) - - else: - # bigquery doens't support float mod, so just cast to bignumeric and hope for the best - x_numeric = typing.cast( - ibis_types.DecimalValue, - x.cast(ibis_dtypes.Decimal(precision=76, scale=38, nullable=True)), - ) - y_numeric = typing.cast( - ibis_types.DecimalValue, - y.cast(ibis_dtypes.Decimal(precision=76, scale=38, nullable=True)), - ) - mod_numeric = _bignumeric_mod(x_numeric, y_numeric) # type: ignore - - # Cast back down based on original types - if _is_bignumeric(x) or _is_bignumeric(y): - return mod_numeric - if _is_numeric(x) or _is_numeric(y): - return mod_numeric.cast(ibis_dtypes.Decimal(38, 9)) - else: - return mod_numeric.cast(ibis_dtypes.float64) - - -def _bignumeric_mod( - x: ibis_types.IntegerValue, - y: ibis_types.IntegerValue, -): - # Hacky short-circuit to avoid passing zero-literal to sql backend, evaluate locally instead to null. 
- op = y.op() - if isinstance(op, ibis_generic.Literal) and op.value == 0: - return ibis_types.null().cast(x.type()) - - bq_mod = x % y # Bigquery will maintain x sign here - - # In BigQuery returned value has the same sign as X. In pandas, the sign of y is used, so we need to flip the result if sign(x) != sign(y) - return ( - ibis_api.case() - .when( - y == _ZERO, - _NAN * x, - ) # Dummy op to propogate nulls and type from x arg - .when( - (y < _ZERO) & (bq_mod > _ZERO), (y + bq_mod) - ) # Convert positive result to negative - .when( - (y > _ZERO) & (bq_mod < _ZERO), (y + bq_mod) - ) # Convert negative result to positive - .else_(bq_mod) - .end() - ) - - -def _int_mod( - x: ibis_types.IntegerValue, - y: ibis_types.IntegerValue, -): - # Hacky short-circuit to avoid passing zero-literal to sql backend, evaluate locally instead to null. - op = y.op() - if isinstance(op, ibis_generic.Literal) and op.value == 0: - return ibis_types.null().cast(x.type()) - - bq_mod = x % y # Bigquery will maintain x sign here - - # In BigQuery returned value has the same sign as X. 
In pandas, the sign of y is used, so we need to flip the result if sign(x) != sign(y) - return ( - ibis_api.case() - .when( - y == _ZERO, - _ZERO * x, - ) # Dummy op to propogate nulls and type from x arg - .when( - (y < _ZERO) & (bq_mod > _ZERO), (y + bq_mod) - ) # Convert positive result to negative - .when( - (y > _ZERO) & (bq_mod < _ZERO), (y + bq_mod) - ) # Convert negative result to positive - .else_(bq_mod) - .end() - ) - - -@scalar_op_compiler.register_binary_op(ops.fillna_op) -def fillna_op( - x: ibis_types.Value, - y: ibis_types.Value, -): - if hasattr(x, "fill_null"): - return x.fill_null(typing.cast(ibis_types.Scalar, y)) - else: - return x.fillna(typing.cast(ibis_types.Scalar, y)) - - -@scalar_op_compiler.register_binary_op(ops.round_op) -def round_op(x: ibis_types.Value, y: ibis_types.Value): - if x.type().is_integer(): - # bq produces float64, but pandas returns int - return ( - typing.cast(ibis_types.NumericValue, x) - .round(digits=typing.cast(ibis_types.IntegerValue, y)) - .cast(ibis_dtypes.int64) - ) - return typing.cast(ibis_types.NumericValue, x).round( - digits=typing.cast(ibis_types.IntegerValue, y) - ) - - -@scalar_op_compiler.register_binary_op(ops.coalesce_op) -def coalesce_impl( - x: ibis_types.Value, - y: ibis_types.Value, -): - if x.name("name").equals(y.name("name")): - return x - else: - return ibis_api.coalesce(x, y) - - -@scalar_op_compiler.register_binary_op(ops.maximum_op) -def maximum_impl( - value: ibis_types.Value, - lower: ibis_types.Value, -): - # Note: propagates nulls - return ( - ibis_api.case().when(lower.isnull() | (value < lower), lower).else_(value).end() - ) - - -@scalar_op_compiler.register_binary_op(ops.minimum_op) -def minimum_impl( - value: ibis_types.Value, - upper: ibis_types.Value, -): - # Note: propagates nulls - return ( - ibis_api.case().when(upper.isnull() | (value > upper), upper).else_(value).end() - ) - - -@scalar_op_compiler.register_binary_op(ops.cosine_distance_op) -def cosine_distance_impl( - 
vector1: ibis_types.Value, - vector2: ibis_types.Value, -): - return vector_distance(vector1, vector2, "COSINE") - - -@scalar_op_compiler.register_binary_op(ops.euclidean_distance_op) -def euclidean_distance_impl( - vector1: ibis_types.Value, - vector2: ibis_types.Value, -): - return vector_distance(vector1, vector2, "EUCLIDEAN") - - -@scalar_op_compiler.register_binary_op(ops.manhattan_distance_op) -def manhattan_distance_impl( - vector1: ibis_types.Value, - vector2: ibis_types.Value, -): - return vector_distance(vector1, vector2, "MANHATTAN") - - -# Blob Ops -@scalar_op_compiler.register_binary_op(ops.obj_make_ref_op) -def obj_make_ref_op(x: ibis_types.Value, y: ibis_types.Value): - return obj_make_ref(uri=x, authorizer=y) - - -# Ternary Operations -@scalar_op_compiler.register_ternary_op(ops.where_op) -def where_op( - original: ibis_types.Value, - condition: ibis_types.Value, - replacement: ibis_types.Value, -) -> ibis_types.Value: - """Returns x if y is true, otherwise returns z.""" - return ibis_api.case().when(condition, original).else_(replacement).end() # type: ignore - - -@scalar_op_compiler.register_ternary_op(ops.clip_op) -def clip_op( - original: ibis_types.Value, - lower: ibis_types.Value, - upper: ibis_types.Value, -) -> ibis_types.Value: - """Clips value to lower and upper bounds.""" - if isinstance(lower, ibis_types.NullScalar) and ( - not isinstance(upper, ibis_types.NullScalar) - ): - return ibis_api.least(original, upper) - elif (not isinstance(lower, ibis_types.NullScalar)) and isinstance( - upper, ibis_types.NullScalar - ): - return ibis_api.greatest(original, lower) - elif isinstance(lower, ibis_types.NullScalar) and ( - isinstance(upper, ibis_types.NullScalar) - ): - return original - else: - # Note: Pandas has unchanged behavior when upper bound and lower bound are flipped. 
This implementation requires that lower_bound < upper_bound - return ibis_api.greatest(ibis_api.least(original, upper), lower) - - -# N-ary Operations -@scalar_op_compiler.register_nary_op(ops.case_when_op) -def case_when_op(*cases_and_outputs: ibis_types.Value) -> ibis_types.Value: - # ibis can handle most type coercions, but we need to force bool -> int - # TODO: dispatch coercion depending on bigframes dtype schema - result_values = cases_and_outputs[1::2] - do_upcast_bool = any(t.type().is_numeric() for t in result_values) - if do_upcast_bool: - # Just need to upcast to int, ibis can handle further coercion - result_values = tuple( - val.cast(ibis_dtypes.int64) if val.type().is_boolean() else val - for val in result_values - ) - - case_val = ibis_api.case() - for predicate, output in zip(cases_and_outputs[::2], result_values): - case_val = case_val.when(predicate, output) - return case_val.end() # type: ignore - - -@scalar_op_compiler.register_nary_op(ops.SqlScalarOp, pass_op=True) -def sql_scalar_op_impl(*operands: ibis_types.Value, op: ops.SqlScalarOp): - return ibis_generic.SqlScalar( - op.sql_template, - values=tuple(typing.cast(ibis_generic.Value, expr.op()) for expr in operands), - output_type=bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype( - op.output_type() - ), - ).to_expr() - - -@scalar_op_compiler.register_nary_op(ops.StructOp, pass_op=True) -def struct_op_impl( - *values: ibis_types.Value, op: ops.StructOp -) -> ibis_types.StructValue: - data = {} - for i, value in enumerate(values): - data[op.column_names[i]] = value - - return ibis_types.struct(data) - - -@scalar_op_compiler.register_nary_op(ops.RowKey, pass_op=True) -def rowkey_op_impl(*values: ibis_types.Value, op: ops.RowKey) -> ibis_types.Value: - return bigframes.core.compile.default_ordering.gen_row_key(values) - - -# Helpers -def is_null(value) -> bool: - # float NaN/inf should be treated as distinct from 'true' null values - return typing.cast(bool, pd.isna(value)) and not 
isinstance(value, float) - - -def _ibis_num(number: float): - return typing.cast(ibis_types.NumericValue, ibis_types.literal(number)) - - -@ibis_udf.scalar.builtin -def st_geogfromtext(a: str) -> ibis_dtypes.geography: # type: ignore - """Convert string to geography.""" - - -@ibis_udf.scalar.builtin -def timestamp(a: str) -> ibis_dtypes.timestamp: # type: ignore - """Convert string to timestamp.""" - - -@ibis_udf.scalar.builtin -def unix_millis(a: ibis_dtypes.timestamp) -> int: # type: ignore - """Convert a timestamp to milliseconds""" - - -@ibis_udf.scalar.builtin -def st_boundary(a: ibis_dtypes.geography) -> ibis_dtypes.geography: # type: ignore - """Find the boundary of a geography.""" - - -@ibis_udf.scalar.builtin -def st_distance(a: ibis_dtypes.geography, b: ibis_dtypes.geography, use_spheroid: bool) -> ibis_dtypes.float: # type: ignore - """Convert string to geography.""" - - -@ibis_udf.scalar.builtin -def st_length(geog: ibis_dtypes.geography, use_spheroid: bool) -> ibis_dtypes.float: # type: ignore - """ST_LENGTH BQ builtin. 
This body is never executed.""" - pass - - -@ibis_udf.scalar.builtin -def unix_micros(a: ibis_dtypes.timestamp) -> int: # type: ignore - """Convert a timestamp to microseconds""" - - -# Need these because ibis otherwise tries to do casts to int that can fail -@ibis_udf.scalar.builtin(name="floor") -def float_floor(a: float) -> float: - """Convert string to timestamp.""" - return 0 # pragma: NO COVER - - -@ibis_udf.scalar.builtin(name="ceil") -def float_ceil(a: float) -> float: - """Convert string to timestamp.""" - return 0 # pragma: NO COVER - - -@ibis_udf.scalar.builtin(name="parse_json") -def parse_json(json_str: str) -> ibis_dtypes.JSON: # type: ignore[empty-body] - """Converts a JSON-formatted STRING value to a JSON value.""" - - -@ibis_udf.scalar.builtin(name="SAFE.PARSE_JSON") -def parse_json_in_safe(json_str: str) -> ibis_dtypes.JSON: # type: ignore[empty-body] - """Converts a JSON-formatted STRING value to a JSON value in the safe mode.""" - - -@ibis_udf.scalar.builtin(name="json_set") -def json_set( # type: ignore[empty-body] - json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String, json_value -) -> ibis_dtypes.JSON: - """Produces a new SQL JSON value with the specified JSON data inserted or replaced.""" - - -@ibis_udf.scalar.builtin(name="json_extract_string_array") -def json_extract_string_array( # type: ignore[empty-body] - json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String -) -> ibis_dtypes.Array[ibis_dtypes.String]: - """Extracts a JSON array and converts it to a SQL ARRAY of STRINGs.""" - - -@ibis_udf.scalar.builtin(name="to_json_string") -def to_json_string( # type: ignore[empty-body] - json_obj: ibis_dtypes.JSON, -) -> ibis_dtypes.String: - """Convert JSON to STRING.""" - - -@ibis_udf.scalar.builtin(name="json_value") -def json_value( # type: ignore[empty-body] - json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String -) -> ibis_dtypes.String: - """Retrieve value of a JSON field as plain STRING.""" - - 
-@ibis_udf.scalar.builtin(name="json_value_array") -def json_value_array( # type: ignore[empty-body] - json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String -) -> ibis_dtypes.Array[ibis_dtypes.String]: - """Extracts a JSON array and converts it to a SQL ARRAY of STRINGs.""" - - -@ibis_udf.scalar.builtin(name="INT64") -def cast_json_to_int64(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Int64: # type: ignore[empty-body] - """Converts a JSON number to a SQL INT64 value.""" - - -@ibis_udf.scalar.builtin(name="SAFE.INT64") -def cast_json_to_int64_in_safe(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Int64: # type: ignore[empty-body] - """Converts a JSON number to a SQL INT64 value in the safe mode.""" - - -@ibis_udf.scalar.builtin(name="FLOAT64") -def cast_json_to_float64(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Float64: # type: ignore[empty-body] - """Attempts to convert a JSON value to a SQL FLOAT64 value.""" - - -@ibis_udf.scalar.builtin(name="SAFE.FLOAT64") -def cast_json_to_float64_in_safe(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Float64: # type: ignore[empty-body] - """Attempts to convert a JSON value to a SQL FLOAT64 value.""" - - -@ibis_udf.scalar.builtin(name="BOOL") -def cast_json_to_bool(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Boolean: # type: ignore[empty-body] - """Attempts to convert a JSON value to a SQL BOOL value.""" - - -@ibis_udf.scalar.builtin(name="SAFE.BOOL") -def cast_json_to_bool_in_safe(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Boolean: # type: ignore[empty-body] - """Attempts to convert a JSON value to a SQL BOOL value.""" - - -@ibis_udf.scalar.builtin(name="STRING") -def cast_json_to_string(json_str: ibis_dtypes.JSON) -> ibis_dtypes.String: # type: ignore[empty-body] - """Attempts to convert a JSON value to a SQL STRING value.""" - - -@ibis_udf.scalar.builtin(name="SAFE.STRING") -def cast_json_to_string_in_safe(json_str: ibis_dtypes.JSON) -> ibis_dtypes.String: # type: ignore[empty-body] - """Attempts to convert a JSON value to a SQL 
STRING value.""" - - -@ibis_udf.scalar.builtin(name="ML.DISTANCE") -def vector_distance(vector1, vector2, type: str) -> ibis_dtypes.Float64: # type: ignore[empty-body] - """Computes the distance between two vectors using specified type ("EUCLIDEAN", "MANHATTAN", or "COSINE")""" - - -@ibis_udf.scalar.builtin(name="OBJ.FETCH_METADATA") -def obj_fetch_metadata(obj_ref: _OBJ_REF_IBIS_DTYPE) -> _OBJ_REF_IBIS_DTYPE: # type: ignore - """Fetch metadata from ObjectRef Struct.""" - - -@ibis_udf.scalar.builtin(name="OBJ.MAKE_REF") -def obj_make_ref(uri: str, authorizer: str) -> _OBJ_REF_IBIS_DTYPE: # type: ignore - """Make ObjectRef Struct from uri and connection.""" - - -@ibis_udf.scalar.builtin(name="OBJ.GET_ACCESS_URL") -def obj_get_access_url(obj_ref: _OBJ_REF_IBIS_DTYPE, mode: ibis_dtypes.String) -> ibis_dtypes.JSON: # type: ignore - """Get access url (as ObjectRefRumtime JSON) from ObjectRef.""" - - -@ibis_udf.scalar.builtin(name="ltrim") -def str_lstrip_op( # type: ignore[empty-body] - x: ibis_dtypes.String, to_strip: ibis_dtypes.String -) -> ibis_dtypes.String: - """Remove leading and trailing characters.""" - - -@ibis_udf.scalar.builtin -def st_isclosed(a: ibis_dtypes.geography) -> ibis_dtypes.boolean: # type: ignore - """Checks if a geography is closed.""" - - -@ibis_udf.scalar.builtin(name="rtrim") -def str_rstrip_op( # type: ignore[empty-body] - x: ibis_dtypes.String, to_strip: ibis_dtypes.String -) -> ibis_dtypes.String: - """Remove leading and trailing characters.""" - - -@ibis_udf.scalar.builtin(name="trim") -def str_strip_op( # type: ignore[empty-body] - x: ibis_dtypes.String, to_strip: ibis_dtypes.String -) -> ibis_dtypes.String: - """Remove leading and trailing characters.""" diff --git a/bigframes/core/compile/scalar_op_registry.py b/bigframes/core/compile/scalar_op_registry.py new file mode 100644 index 0000000000..ed8c1103d5 --- /dev/null +++ b/bigframes/core/compile/scalar_op_registry.py @@ -0,0 +1,2086 @@ +# Copyright 2023 Google LLC +# +# Licensed 
under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import functools +import typing + +import bigframes_vendored.ibis.expr.api as ibis_api +import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes +import bigframes_vendored.ibis.expr.operations.generic as ibis_generic +import bigframes_vendored.ibis.expr.operations.udf as ibis_udf +import bigframes_vendored.ibis.expr.types as ibis_types +import numpy as np +import pandas as pd + +from bigframes.core.compile.constants import UNIT_TO_US_CONVERSION_FACTORS +import bigframes.core.compile.default_ordering +import bigframes.core.compile.ibis_types +from bigframes.core.compile.scalar_op_compiler import ( + scalar_op_compiler, # TODO(tswast): avoid import of variables +) +import bigframes.operations as ops + +_ZERO = typing.cast(ibis_types.NumericValue, ibis_types.literal(0)) +_NAN = typing.cast(ibis_types.NumericValue, ibis_types.literal(np.nan)) +_INF = typing.cast(ibis_types.NumericValue, ibis_types.literal(np.inf)) +_NEG_INF = typing.cast(ibis_types.NumericValue, ibis_types.literal(-np.inf)) + +# Approx Highest number you can pass in to EXP function and get a valid FLOAT64 result +# FLOAT64 has 11 exponent bits, so max values is about 2**(2**10) +# ln(2**(2**10)) == (2**10)*ln(2) ~= 709.78, so EXP(x) for x>709.78 will overflow. 
+_FLOAT64_EXP_BOUND = typing.cast(ibis_types.NumericValue, ibis_types.literal(709.78)) + +_OBJ_REF_STRUCT_SCHEMA = ( + ("uri", ibis_dtypes.String), + ("version", ibis_dtypes.String), + ("authorizer", ibis_dtypes.String), + ("details", ibis_dtypes.JSON), +) +_OBJ_REF_IBIS_DTYPE = ibis_dtypes.Struct.from_tuples(_OBJ_REF_STRUCT_SCHEMA) # type: ignore + + +### Unary Ops +@scalar_op_compiler.register_unary_op(ops.hash_op) +def hash_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.IntegerValue, x).hash() + + +# Trig Functions +@scalar_op_compiler.register_unary_op(ops.sin_op) +def sin_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).sin() + + +@scalar_op_compiler.register_unary_op(ops.cos_op) +def cos_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).cos() + + +@scalar_op_compiler.register_unary_op(ops.tan_op) +def tan_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).tan() + + +# Inverse trig functions +@scalar_op_compiler.register_unary_op(ops.arcsin_op) +def arcsin_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value.abs() <= _ibis_num(1) + return (~domain).ifelse(_NAN, numeric_value.asin()) + + +@scalar_op_compiler.register_unary_op(ops.arccos_op) +def arccos_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value.abs() <= _ibis_num(1) + return (~domain).ifelse(_NAN, numeric_value.acos()) + + +@scalar_op_compiler.register_unary_op(ops.arctan_op) +def arctan_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).atan() + + +@scalar_op_compiler.register_binary_op(ops.arctan2_op) +def arctan2_op_impl(x: ibis_types.Value, y: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).atan2( + typing.cast(ibis_types.NumericValue, y) + ) + + +# Hyperbolic trig functions +# BQ has these functions, but Ibis doesn't 
+@scalar_op_compiler.register_unary_op(ops.sinh_op) +def sinh_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + sinh_result = (numeric_value.exp() - (numeric_value.negate()).exp()) / _ibis_num(2) + domain = numeric_value.abs() < _FLOAT64_EXP_BOUND + return (~domain).ifelse(_INF * numeric_value.sign(), sinh_result) + + +@scalar_op_compiler.register_unary_op(ops.cosh_op) +def cosh_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + cosh_result = (numeric_value.exp() + (numeric_value.negate()).exp()) / _ibis_num(2) + domain = numeric_value.abs() < _FLOAT64_EXP_BOUND + return (~domain).ifelse(_INF, cosh_result) + + +@scalar_op_compiler.register_unary_op(ops.tanh_op) +def tanh_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + tanh_result = (numeric_value.exp() - (numeric_value.negate()).exp()) / ( + numeric_value.exp() + (numeric_value.negate()).exp() + ) + # Beyond +-20, is effectively just the sign function + domain = numeric_value.abs() < _ibis_num(20) + return (~domain).ifelse(numeric_value.sign(), tanh_result) + + +@scalar_op_compiler.register_unary_op(ops.arcsinh_op) +def arcsinh_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + sqrt_part = ((numeric_value * numeric_value) + _ibis_num(1)).sqrt() + return (numeric_value.abs() + sqrt_part).ln() * numeric_value.sign() + + +@scalar_op_compiler.register_unary_op(ops.arccosh_op) +def arccosh_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + sqrt_part = ((numeric_value * numeric_value) - _ibis_num(1)).sqrt() + acosh_result = (numeric_value + sqrt_part).ln() + domain = numeric_value >= _ibis_num(1) + return (~domain).ifelse(_NAN, acosh_result) + + +@scalar_op_compiler.register_unary_op(ops.arctanh_op) +def arctanh_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = 
numeric_value.abs() < _ibis_num(1) + numerator = numeric_value + _ibis_num(1) + denominator = _ibis_num(1) - numeric_value + ln_input = typing.cast(ibis_types.NumericValue, numerator.div(denominator)) + atanh_result = ln_input.ln().div(2) + + out_of_domain = (numeric_value.abs() == _ibis_num(1)).ifelse( + _INF * numeric_value, _NAN + ) + + return (~domain).ifelse(out_of_domain, atanh_result) + + +# Numeric Ops +@scalar_op_compiler.register_unary_op(ops.floor_op) +def floor_op_impl(x: ibis_types.Value): + x_numeric = typing.cast(ibis_types.NumericValue, x) + if x_numeric.type().is_integer(): + return x_numeric.cast(ibis_dtypes.Float64()) + if x_numeric.type().is_floating(): + # Default ibis impl tries to cast to integer, which doesn't match pandas and can overflow + return float_floor(x_numeric) + else: # numeric + return x_numeric.floor() + + +@scalar_op_compiler.register_unary_op(ops.ceil_op) +def ceil_op_impl(x: ibis_types.Value): + x_numeric = typing.cast(ibis_types.NumericValue, x) + if x_numeric.type().is_integer(): + return x_numeric.cast(ibis_dtypes.Float64()) + if x_numeric.type().is_floating(): + # Default ibis impl tries to cast to integer, which doesn't match pandas and can overflow + return float_ceil(x_numeric) + else: # numeric + return x_numeric.ceil() + + +@scalar_op_compiler.register_unary_op(ops.abs_op) +def abs_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).abs() + + +@scalar_op_compiler.register_unary_op(ops.pos_op) +def pos_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x) + + +@scalar_op_compiler.register_unary_op(ops.neg_op) +def neg_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).negate() + + +@scalar_op_compiler.register_unary_op(ops.sqrt_op) +def sqrt_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value >= _ZERO + return (~domain).ifelse(_NAN, numeric_value.sqrt()) + + 
+@scalar_op_compiler.register_unary_op(ops.log10_op) +def log10_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value > _ZERO + out_of_domain = (numeric_value == _ZERO).ifelse(_NEG_INF, _NAN) + return (~domain).ifelse(out_of_domain, numeric_value.log10()) + + +@scalar_op_compiler.register_unary_op(ops.ln_op) +def ln_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value > _ZERO + out_of_domain = (numeric_value == _ZERO).ifelse(_NEG_INF, _NAN) + return (~domain).ifelse(out_of_domain, numeric_value.ln()) + + +@scalar_op_compiler.register_unary_op(ops.log1p_op) +def log1p_op_impl(x: ibis_types.Value): + return ln_op_impl(_ibis_num(1) + x) + + +@scalar_op_compiler.register_unary_op(ops.exp_op) +def exp_op_impl(x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value < _FLOAT64_EXP_BOUND + return (~domain).ifelse(_INF, numeric_value.exp()) + + +@scalar_op_compiler.register_unary_op(ops.expm1_op) +def expm1_op_impl(x: ibis_types.Value): + return exp_op_impl(x) - _ibis_num(1) + + +@scalar_op_compiler.register_unary_op(ops.invert_op) +def invert_op_impl(x: ibis_types.Value): + return x.__invert__() # type: ignore + + +## String Operation +@scalar_op_compiler.register_unary_op(ops.len_op) +def len_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).length().cast(ibis_dtypes.int64) + + +@scalar_op_compiler.register_unary_op(ops.reverse_op) +def reverse_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).reverse() + + +@scalar_op_compiler.register_unary_op(ops.lower_op) +def lower_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).lower() + + +@scalar_op_compiler.register_unary_op(ops.upper_op) +def upper_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).upper() + + 
+@scalar_op_compiler.register_unary_op(ops.StrLstripOp, pass_op=True) +def str_lstrip_op_impl(x: ibis_types.Value, op: ops.StrStripOp): + return str_lstrip_op(x, to_strip=op.to_strip) + + +@scalar_op_compiler.register_unary_op(ops.StrRstripOp, pass_op=True) +def str_rstrip_op_impl(x: ibis_types.Value, op: ops.StrRstripOp): + return str_rstrip_op(x, to_strip=op.to_strip) + + +@scalar_op_compiler.register_unary_op(ops.StrStripOp, pass_op=True) +def str_strip_op_impl(x: ibis_types.Value, op: ops.StrStripOp): + return str_strip_op(x, to_strip=op.to_strip) + + +@scalar_op_compiler.register_unary_op(ops.isnumeric_op) +def isnumeric_op_impl(x: ibis_types.Value): + # catches all members of the Unicode number class, which matches pandas isnumeric + # see https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#regexp_contains + # TODO: Validate correctness, may miss e.g. the ⅕ character + return typing.cast(ibis_types.StringValue, x).re_search(r"^(\pN+)$") + + +@scalar_op_compiler.register_unary_op(ops.isalpha_op) +def isalpha_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).re_search( + r"^(\p{Lm}|\p{Lt}|\p{Lu}|\p{Ll}|\p{Lo})+$" + ) + + +@scalar_op_compiler.register_unary_op(ops.isdigit_op) +def isdigit_op_impl(x: ibis_types.Value): + # Based on docs, should include superscript/subscript-ed numbers + # Tests however pass only when set to Nd unicode class + return typing.cast(ibis_types.StringValue, x).re_search(r"^(\p{Nd})+$") + + +@scalar_op_compiler.register_unary_op(ops.isdecimal_op) +def isdecimal_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).re_search(r"^(\p{Nd})+$") + + +@scalar_op_compiler.register_unary_op(ops.isalnum_op) +def isalnum_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).re_search( + r"^(\p{N}|\p{Lm}|\p{Lt}|\p{Lu}|\p{Ll}|\p{Lo})+$" + ) + + +@scalar_op_compiler.register_unary_op(ops.isspace_op) +def isspace_op_impl(x: ibis_types.Value): + # All 
characters are whitespace characters, False for empty string + return typing.cast(ibis_types.StringValue, x).re_search(r"^\s+$") + + +@scalar_op_compiler.register_unary_op(ops.islower_op) +def islower_op_impl(x: ibis_types.Value): + # No upper case characters, min one cased character + # See: https://docs.python.org/3/library/stdtypes.html#str + return typing.cast(ibis_types.StringValue, x).re_search(r"\p{Ll}") & ~typing.cast( + ibis_types.StringValue, x + ).re_search(r"\p{Lu}|\p{Lt}") + + +@scalar_op_compiler.register_unary_op(ops.isupper_op) +def isupper_op_impl(x: ibis_types.Value): + # No lower case characters, min one cased character + # See: https://docs.python.org/3/library/stdtypes.html#str + return typing.cast(ibis_types.StringValue, x).re_search(r"\p{Lu}") & ~typing.cast( + ibis_types.StringValue, x + ).re_search(r"\p{Ll}|\p{Lt}") + + +@scalar_op_compiler.register_unary_op(ops.capitalize_op) +def capitalize_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).capitalize() + + +@scalar_op_compiler.register_unary_op(ops.StrContainsOp, pass_op=True) +def strcontains_op(x: ibis_types.Value, op: ops.StrContainsOp): + return typing.cast(ibis_types.StringValue, x).contains(op.pat) + + +@scalar_op_compiler.register_unary_op(ops.StrContainsRegexOp, pass_op=True) +def contains_regex_op_impl(x: ibis_types.Value, op: ops.StrContainsRegexOp): + return typing.cast(ibis_types.StringValue, x).re_search(op.pat) + + +@scalar_op_compiler.register_unary_op(ops.StrGetOp, pass_op=True) +def strget_op_impl(x: ibis_types.Value, op: ops.StrGetOp): + substr = typing.cast( + ibis_types.StringValue, typing.cast(ibis_types.StringValue, x)[op.i] + ) + return substr.nullif(ibis_types.literal("")) + + +@scalar_op_compiler.register_unary_op(ops.StrPadOp, pass_op=True) +def strpad_op_impl(x: ibis_types.Value, op: ops.StrPadOp): + str_val = typing.cast(ibis_types.StringValue, x) + + # SQL pad operations will truncate, we do not want to truncate though. 
+ pad_length = typing.cast( + ibis_types.IntegerValue, ibis_api.greatest(str_val.length(), op.length) + ) + if op.side == "left": + return str_val.lpad(pad_length, op.fillchar) + elif op.side == "right": + return str_val.rpad(pad_length, op.fillchar) + else: # side == both + # Pad more on right side if can't pad both sides equally + two = typing.cast(ibis_types.IntegerValue, 2) + lpad_amount = ((pad_length - str_val.length()) // two) + str_val.length() + return str_val.lpad( + length=typing.cast(ibis_types.IntegerValue, lpad_amount), pad=op.fillchar + ).rpad(pad_length, op.fillchar) + + +@scalar_op_compiler.register_unary_op(ops.ReplaceStrOp, pass_op=True) +def replacestring_op_impl(x: ibis_types.Value, op: ops.ReplaceStrOp): + pat_str_value = typing.cast(ibis_types.StringValue, ibis_types.literal(op.pat)) + repl_str_value = typing.cast(ibis_types.StringValue, ibis_types.literal(op.repl)) + return typing.cast(ibis_types.StringValue, x).replace(pat_str_value, repl_str_value) + + +@scalar_op_compiler.register_unary_op(ops.RegexReplaceStrOp, pass_op=True) +def replaceregex_op_impl(x: ibis_types.Value, op: ops.RegexReplaceStrOp): + return typing.cast(ibis_types.StringValue, x).re_replace(op.pat, op.repl) + + +@scalar_op_compiler.register_unary_op(ops.StartsWithOp, pass_op=True) +def startswith_op_impl(x: ibis_types.Value, op: ops.StartsWithOp): + any_match = None + for pat in op.pat: + pat_match = typing.cast(ibis_types.StringValue, x).startswith(pat) + if any_match is not None: + any_match = any_match | pat_match + else: + any_match = pat_match + return any_match if any_match is not None else ibis_types.literal(False) + + +@scalar_op_compiler.register_unary_op(ops.EndsWithOp, pass_op=True) +def endswith_op_impl(x: ibis_types.Value, op: ops.EndsWithOp): + any_match = None + for pat in op.pat: + pat_match = typing.cast(ibis_types.StringValue, x).endswith(pat) + if any_match is not None: + any_match = any_match | pat_match + else: + any_match = pat_match + return 
any_match if any_match is not None else ibis_types.literal(False) + + +@scalar_op_compiler.register_unary_op(ops.StringSplitOp, pass_op=True) +def stringsplit_op_impl(x: ibis_types.Value, op: ops.StringSplitOp): + return typing.cast(ibis_types.StringValue, x).split(delimiter=op.pat) # type: ignore + + +@scalar_op_compiler.register_unary_op(ops.ZfillOp, pass_op=True) +def zfill_op_impl(x: ibis_types.Value, op: ops.ZfillOp): + str_value = typing.cast(ibis_types.StringValue, x) + return ( + ibis_api.case() + .when( + str_value[0] == "-", + "-" + + strpad_op_impl( + str_value.substr(1), + ops.StrPadOp(length=op.width - 1, fillchar="0", side="left"), + ), + ) + .else_( + strpad_op_impl( + str_value, ops.StrPadOp(length=op.width, fillchar="0", side="left") + ) + ) + .end() + ) + + +@scalar_op_compiler.register_unary_op(ops.StrFindOp, pass_op=True) +def find_op_impl(x: ibis_types.Value, op: ops.StrFindOp): + return typing.cast(ibis_types.StringValue, x).find(op.substr, op.start, op.end) + + +@scalar_op_compiler.register_unary_op(ops.StrExtractOp, pass_op=True) +def extract_op_impl(x: ibis_types.Value, op: ops.StrExtractOp): + return typing.cast(ibis_types.StringValue, x).re_extract(op.pat, op.n) + + +@scalar_op_compiler.register_unary_op(ops.StrSliceOp, pass_op=True) +def slice_op_impl(x: ibis_types.Value, op: ops.StrSliceOp): + return typing.cast(ibis_types.StringValue, x)[op.start : op.end] + + +@scalar_op_compiler.register_unary_op(ops.StrRepeatOp, pass_op=True) +def repeat_op_impl(x: ibis_types.Value, op: ops.StrRepeatOp): + return typing.cast(ibis_types.StringValue, x).repeat(op.repeats) + + +## Datetime Ops +@scalar_op_compiler.register_unary_op(ops.day_op) +def day_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.TimestampValue, x).day().cast(ibis_dtypes.int64) + + +@scalar_op_compiler.register_unary_op(ops.date_op) +def date_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.TimestampValue, x).date() + + 
@scalar_op_compiler.register_unary_op(ops.iso_day_op)
def iso_day_op_impl(x: ibis_types.Value):
    # Plus 1 because iso day of week uses 1-based indexing
    return dayofweek_op_impl(x) + 1


@scalar_op_compiler.register_unary_op(ops.iso_week_op)
def iso_week_op_impl(x: ibis_types.Value):
    return typing.cast(ibis_types.TimestampValue, x).week_of_year()


@scalar_op_compiler.register_unary_op(ops.iso_year_op)
def iso_year_op_impl(x: ibis_types.Value):
    return typing.cast(ibis_types.TimestampValue, x).iso_year()


@scalar_op_compiler.register_unary_op(ops.dayofweek_op)
def dayofweek_op_impl(x: ibis_types.Value):
    # 0-based day-of-week index, cast to int64 for BigFrames integer dtype.
    return (
        typing.cast(ibis_types.TimestampValue, x)
        .day_of_week.index()
        .cast(ibis_dtypes.int64)
    )


@scalar_op_compiler.register_unary_op(ops.dayofyear_op)
def dayofyear_op_impl(x: ibis_types.Value):
    return (
        typing.cast(ibis_types.TimestampValue, x).day_of_year().cast(ibis_dtypes.int64)
    )


@scalar_op_compiler.register_unary_op(ops.hour_op)
def hour_op_impl(x: ibis_types.Value):
    return typing.cast(ibis_types.TimestampValue, x).hour().cast(ibis_dtypes.int64)


@scalar_op_compiler.register_unary_op(ops.minute_op)
def minute_op_impl(x: ibis_types.Value):
    return typing.cast(ibis_types.TimestampValue, x).minute().cast(ibis_dtypes.int64)


@scalar_op_compiler.register_unary_op(ops.month_op)
def month_op_impl(x: ibis_types.Value):
    return typing.cast(ibis_types.TimestampValue, x).month().cast(ibis_dtypes.int64)


@scalar_op_compiler.register_unary_op(ops.quarter_op)
def quarter_op_impl(x: ibis_types.Value):
    return typing.cast(ibis_types.TimestampValue, x).quarter().cast(ibis_dtypes.int64)


@scalar_op_compiler.register_unary_op(ops.second_op)
def second_op_impl(x: ibis_types.Value):
    return typing.cast(ibis_types.TimestampValue, x).second().cast(ibis_dtypes.int64)


@scalar_op_compiler.register_unary_op(ops.StrftimeOp, pass_op=True)
def strftime_op_impl(x: ibis_types.Value, op: ops.StrftimeOp):
    return (
        typing.cast(ibis_types.TimestampValue, x)
        .strftime(op.date_format)
        .cast(ibis_dtypes.str)
    )


@scalar_op_compiler.register_unary_op(ops.UnixSeconds)
def unix_seconds_op_impl(x: ibis_types.TimestampValue):
    return x.epoch_seconds()


@scalar_op_compiler.register_unary_op(ops.UnixMicros)
def unix_micros_op_impl(x: ibis_types.TimestampValue):
    return unix_micros(x)


@scalar_op_compiler.register_unary_op(ops.UnixMillis)
def unix_millis_op_impl(x: ibis_types.TimestampValue):
    return unix_millis(x)


@scalar_op_compiler.register_binary_op(ops.timestamp_diff_op)
def timestamp_diff_op_impl(x: ibis_types.TimestampValue, y: ibis_types.TimestampValue):
    return x.delta(y, "microsecond")


@scalar_op_compiler.register_binary_op(ops.timestamp_add_op)
def timestamp_add_op_impl(x: ibis_types.TimestampValue, y: ibis_types.IntegerValue):
    # y is a timedelta encoded as integer microseconds.
    return x + y.to_interval("us")


@scalar_op_compiler.register_binary_op(ops.timestamp_sub_op)
def timestamp_sub_op_impl(x: ibis_types.TimestampValue, y: ibis_types.IntegerValue):
    return x - y.to_interval("us")


@scalar_op_compiler.register_binary_op(ops.date_diff_op)
def date_diff_op_impl(x: ibis_types.DateValue, y: ibis_types.DateValue):
    # Difference in days scaled to microseconds (the timedelta encoding).
    return x.delta(y, "day") * int(UNIT_TO_US_CONVERSION_FACTORS["d"])  # type: ignore


@scalar_op_compiler.register_binary_op(ops.date_add_op)
def date_add_op_impl(x: ibis_types.DateValue, y: ibis_types.IntegerValue):
    return x.cast(ibis_dtypes.timestamp()) + y.to_interval("us")  # type: ignore


@scalar_op_compiler.register_binary_op(ops.date_sub_op)
def date_sub_op_impl(x: ibis_types.DateValue, y: ibis_types.IntegerValue):
    return x.cast(ibis_dtypes.timestamp()) - y.to_interval("us")  # type: ignore


@scalar_op_compiler.register_unary_op(ops.FloorDtOp, pass_op=True)
def floor_dt_op_impl(x: ibis_types.Value, op: ops.FloorDtOp):
    """Floor a datetime to the given pandas frequency, preserving the input type."""
    supported_freqs = ["Y", "Q", "M", "W", "D", "h", "min", "s", "ms", "us", "ns"]
    pandas_to_ibis_freqs = {"min": "m"}
    if op.freq not in supported_freqs:
        # Fixed typo in the message ("paramater" -> "parameter").
        raise NotImplementedError(
            f"Unsupported freq parameter: {op.freq}"
            + " Supported freq parameters are: "
            + ",".join(supported_freqs)
        )
    if op.freq in pandas_to_ibis_freqs:
        ibis_freq = pandas_to_ibis_freqs[op.freq]
    else:
        ibis_freq = op.freq
    result_type = x.type()
    result = typing.cast(ibis_types.TimestampValue, x)
    result = result.truncate(ibis_freq)  # type: ignore
    return result.cast(result_type)


@scalar_op_compiler.register_binary_op(ops.DatetimeToIntegerLabelOp, pass_op=True)
def datetime_to_integer_label_op_impl(
    x: ibis_types.Value, y: ibis_types.Value, op: ops.DatetimeToIntegerLabelOp
):
    # Determine if the frequency is fixed by checking if 'op.freq.nanos' is defined.
    # Non-fixed frequencies (weekly and coarser) raise ValueError on `.nanos`.
    try:
        return datetime_to_integer_label_fixed_frequency(x, y, op)
    except ValueError:
        return datetime_to_integer_label_non_fixed_frequency(x, y, op)


def datetime_to_integer_label_fixed_frequency(
    x: ibis_types.Value, y: ibis_types.Value, op: ops.DatetimeToIntegerLabelOp
):
    """
    This function handles fixed frequency conversions where the unit can range
    from microseconds (us) to days.
    """
    us = op.freq.nanos / 1000
    x_int = x.cast(ibis_dtypes.Timestamp(timezone="UTC")).cast(ibis_dtypes.int64)
    first = calculate_resample_first(y, op.origin)
    x_int_label = (x_int - first) // us
    return x_int_label


def datetime_to_integer_label_non_fixed_frequency(
    x: ibis_types.Value, y: ibis_types.Value, op: ops.DatetimeToIntegerLabelOp
):
    """
    This function handles non-fixed frequency conversions for units ranging
    from weeks to years.
    """
    rule_code = op.freq.rule_code
    n = op.freq.n
    if rule_code == "W-SUN":  # Weekly
        us = n * 7 * 24 * 60 * 60 * 1000000
        x = x.truncate("week") + ibis_api.interval(days=6)  # type: ignore
        y = y.truncate("week") + ibis_api.interval(days=6)  # type: ignore
        x_int = x.cast(ibis_dtypes.Timestamp(timezone="UTC")).cast(ibis_dtypes.int64)
        first = y.cast(ibis_dtypes.Timestamp(timezone="UTC")).cast(ibis_dtypes.int64)
        x_int_label = (
            ibis_api.case()
            .when(x_int == first, 0)
            .else_((x_int - first - 1) // us + 1)  # type: ignore
            .end()
        )
    elif rule_code == "ME":  # Monthly
        x_int = x.year() * 12 + x.month() - 1  # type: ignore
        first = y.year() * 12 + y.month() - 1  # type: ignore
        x_int_label = (
            ibis_api.case()
            .when(x_int == first, 0)
            .else_((x_int - first - 1) // n + 1)  # type: ignore
            .end()
        )
    elif rule_code == "QE-DEC":  # Quarterly
        x_int = x.year() * 4 + x.quarter() - 1  # type: ignore
        first = y.year() * 4 + y.quarter() - 1  # type: ignore
        x_int_label = (
            ibis_api.case()
            .when(x_int == first, 0)
            .else_((x_int - first - 1) // n + 1)  # type: ignore
            .end()
        )
    elif rule_code == "YE-DEC":  # Yearly
        x_int = x.year()  # type: ignore
        first = y.year()  # type: ignore
        x_int_label = (
            ibis_api.case()
            .when(x_int == first, 0)
            .else_((x_int - first - 1) // n + 1)  # type: ignore
            .end()
        )
    else:
        raise ValueError(rule_code)
    return x_int_label


@scalar_op_compiler.register_binary_op(ops.IntegerLabelToDatetimeOp, pass_op=True)
def integer_label_to_datetime_op_impl(
    x: ibis_types.Value, y: ibis_types.Value, op: ops.IntegerLabelToDatetimeOp
):
    # Determine if the frequency is fixed by checking if 'op.freq.nanos' is defined.
    try:
        return integer_label_to_datetime_op_fixed_frequency(x, y, op)
    except ValueError:
        return integer_label_to_datetime_op_non_fixed_frequency(x, y, op)


def integer_label_to_datetime_op_fixed_frequency(
    x: ibis_types.Value, y: ibis_types.Value, op: ops.IntegerLabelToDatetimeOp
):
    """
    This function handles fixed frequency conversions where the unit can range
    from microseconds (us) to days.
    """
    us = op.freq.nanos / 1000

    first = calculate_resample_first(y, op.origin)

    x_label = (
        (x * us + first)  # type: ignore
        .cast(ibis_dtypes.int64)
        .to_timestamp(unit="us")
        .cast(ibis_dtypes.Timestamp(timezone="UTC"))
        .cast(y.type())
    )
    return x_label


def integer_label_to_datetime_op_non_fixed_frequency(
    x: ibis_types.Value, y: ibis_types.Value, op: ops.IntegerLabelToDatetimeOp
):
    """
    This function handles non-fixed frequency conversions for units ranging
    from weeks to years.
    """
    rule_code = op.freq.rule_code
    n = op.freq.n
    if rule_code == "W-SUN":  # Weekly
        us = n * 7 * 24 * 60 * 60 * 1000000
        first = (
            y.cast(ibis_dtypes.Timestamp(timezone="UTC")).truncate("week")  # type: ignore
            + ibis_api.interval(days=6)
        ).cast(ibis_dtypes.int64)
        x_label = (
            (x * us + first)  # type: ignore
            .cast(ibis_dtypes.int64)
            .to_timestamp(unit="us")
            .cast(ibis_dtypes.Timestamp(timezone="UTC"))
            .cast(y.type())
        )
    elif rule_code == "ME":  # Monthly
        one = ibis_types.literal(1)
        twelve = ibis_types.literal(12)
        first = y.year() * twelve + y.month() - one  # type: ignore

        x = x * n + first  # type: ignore
        year = x // twelve  # type: ignore
        month = (x % twelve) + one  # type: ignore

        # Label is the last day of the month: first day of next month minus 1.
        next_year = (month == twelve).ifelse(year + one, year)
        next_month = (month == twelve).ifelse(one, month + one)
        next_month_date = ibis_api.timestamp(
            typing.cast(ibis_types.IntegerValue, next_year),
            typing.cast(ibis_types.IntegerValue, next_month),
            1,
            0,
            0,
            0,
        )
        x_label = next_month_date - ibis_api.interval(days=1)
    elif rule_code == "QE-DEC":  # Quarterly
        one = ibis_types.literal(1)
        three = ibis_types.literal(3)
        four = ibis_types.literal(4)
        twelve = ibis_types.literal(12)
        first = y.year() * four + y.quarter() - one  # type: ignore

        x = x * n + first  # type: ignore
        year = x // four  # type: ignore
        month = ((x % four) + one) * three  # type: ignore

        next_year = (month == twelve).ifelse(year + one, year)
        next_month = (month == twelve).ifelse(one, month + one)
        next_month_date = ibis_api.timestamp(
            typing.cast(ibis_types.IntegerValue, next_year),
            typing.cast(ibis_types.IntegerValue, next_month),
            1,
            0,
            0,
            0,
        )

        x_label = next_month_date - ibis_api.interval(days=1)
    elif rule_code == "YE-DEC":  # Yearly
        one = ibis_types.literal(1)
        first = y.year()  # type: ignore
        x = x * n + first  # type: ignore
        next_year = x + one  # type: ignore
        next_month_date = ibis_api.timestamp(
            typing.cast(ibis_types.IntegerValue, next_year),
            1,
            1,
            0,
            0,
            0,
        )
        x_label = next_month_date - ibis_api.interval(days=1)
    else:
        # Previously fell through and failed with UnboundLocalError on x_label;
        # raise explicitly, consistent with
        # datetime_to_integer_label_non_fixed_frequency.
        raise ValueError(rule_code)

    return x_label.cast(ibis_dtypes.Timestamp(timezone="UTC")).cast(y.type())


def calculate_resample_first(y: ibis_types.Value, origin):
    """Return the resample origin as integer microseconds since the epoch."""
    if origin == "epoch":
        return ibis_types.literal(0)
    elif origin == "start_day":
        return (
            y.cast(ibis_dtypes.date)
            .cast(ibis_dtypes.Timestamp(timezone="UTC"))
            .cast(ibis_dtypes.int64)
        )
    elif origin == "start":
        return y.cast(ibis_dtypes.Timestamp(timezone="UTC")).cast(ibis_dtypes.int64)
    else:
        raise ValueError(f"Origin {origin} not supported")


@scalar_op_compiler.register_unary_op(ops.time_op)
def time_op_impl(x: ibis_types.Value):
    return typing.cast(ibis_types.TimestampValue, x).time()


@scalar_op_compiler.register_unary_op(ops.year_op)
def year_op_impl(x: ibis_types.Value):
    return typing.cast(ibis_types.TimestampValue, x).year().cast(ibis_dtypes.int64)


@scalar_op_compiler.register_unary_op(ops.normalize_op)
def normalize_op_impl(x: ibis_types.Value):
    # Truncate to midnight while preserving the input type.
    result_type = x.type()
    result = x.truncate("D")  # type: ignore
    return result.cast(result_type)


# Geo Ops
@scalar_op_compiler.register_unary_op(ops.geo_area_op)
def geo_area_op_impl(x: ibis_types.Value):
    return typing.cast(ibis_types.GeoSpatialValue, x).area()


@scalar_op_compiler.register_unary_op(ops.geo_st_astext_op)
def geo_st_astext_op_impl(x: ibis_types.Value):
    return typing.cast(ibis_types.GeoSpatialValue, x).as_text()


@scalar_op_compiler.register_unary_op(ops.geo_st_boundary_op, pass_op=False)
def geo_st_boundary_op_impl(x: ibis_types.Value):
    return st_boundary(x)


@scalar_op_compiler.register_binary_op(ops.geo_st_difference_op, pass_op=False)
def geo_st_difference_op_impl(x: ibis_types.Value, y: ibis_types.Value):
    return typing.cast(ibis_types.GeoSpatialValue, x).difference(
        typing.cast(ibis_types.GeoSpatialValue, y)
    )


@scalar_op_compiler.register_binary_op(ops.GeoStDistanceOp, pass_op=True)
def geo_st_distance_op_impl(
    x: ibis_types.Value, y: ibis_types.Value, op: ops.GeoStDistanceOp
):
    return st_distance(x, y, op.use_spheroid)


@scalar_op_compiler.register_unary_op(ops.geo_st_geogfromtext_op)
def geo_st_geogfromtext_op_impl(x: ibis_types.Value):
    # Ibis doesn't seem to provide a dedicated method to cast from string to geography,
    # so we use a BigQuery scalar function, st_geogfromtext(), directly.
    return st_geogfromtext(x)


@scalar_op_compiler.register_binary_op(ops.geo_st_geogpoint_op, pass_op=False)
def geo_st_geogpoint_op_impl(x: ibis_types.Value, y: ibis_types.Value):
    return typing.cast(ibis_types.NumericValue, x).point(
        typing.cast(ibis_types.NumericValue, y)
    )


@scalar_op_compiler.register_binary_op(ops.geo_st_intersection_op, pass_op=False)
def geo_st_intersection_op_impl(x: ibis_types.Value, y: ibis_types.Value):
    return typing.cast(ibis_types.GeoSpatialValue, x).intersection(
        typing.cast(ibis_types.GeoSpatialValue, y)
    )


@scalar_op_compiler.register_unary_op(ops.geo_st_isclosed_op, pass_op=False)
def geo_st_isclosed_op_impl(x: ibis_types.Value):
    return st_isclosed(x)


@scalar_op_compiler.register_unary_op(ops.geo_x_op)
def geo_x_op_impl(x: ibis_types.Value):
    return typing.cast(ibis_types.GeoSpatialValue, x).x()


@scalar_op_compiler.register_unary_op(ops.GeoStLengthOp, pass_op=True)
def geo_length_op_impl(x: ibis_types.Value, op: ops.GeoStLengthOp):
    # Call the st_length UDF defined in this file (or imported)
    return st_length(x, op.use_spheroid)


@scalar_op_compiler.register_unary_op(ops.geo_y_op)
def geo_y_op_impl(x: ibis_types.Value):
    return typing.cast(ibis_types.GeoSpatialValue, x).y()


# Parameterized ops
@scalar_op_compiler.register_unary_op(ops.StructFieldOp, pass_op=True)
def struct_field_op_impl(x: ibis_types.Value, op: ops.StructFieldOp):
    # Accept either a field name or a positional index into the struct.
    struct_value = typing.cast(ibis_types.StructValue, x)
    if isinstance(op.name_or_index, str):
        name = op.name_or_index
    else:
        name = struct_value.names[op.name_or_index]

    result = struct_value[name]
    return result.cast(result.type()(nullable=True)).name(name)


def numeric_to_datetime(
    x: ibis_types.Value, unit: str, safe: bool = False
) -> ibis_types.TimestampValue:
    """Convert a numeric value in the given unit to a UTC timestamp."""
    if not isinstance(x, ibis_types.IntegerValue) and not isinstance(
        x, ibis_types.FloatingValue
    ):
        raise TypeError("Non-numerical types are not supposed to reach this function.")

    if unit not in UNIT_TO_US_CONVERSION_FACTORS:
        raise ValueError(f"Cannot convert input with unit '{unit}'.")
    x_converted = x * typing.cast(
        ibis_types.IntegerValue, UNIT_TO_US_CONVERSION_FACTORS[unit]
    )
    x_converted = (
        x_converted.try_cast(ibis_dtypes.int64)  # type: ignore
        if safe
        else x_converted.cast(ibis_dtypes.int64)
    )

    # Note: Due to an issue where casting directly to a timestamp
    # without a timezone does not work, we first cast to UTC. This
    # approach appears to bypass a potential bug in Ibis's cast function,
    # allowing for subsequent casting to a timestamp type without timezone
    # information. Further investigation is needed to confirm this behavior.
    return x_converted.to_timestamp(unit="us").cast(  # type: ignore
        ibis_dtypes.Timestamp(timezone="UTC")
    )


@scalar_op_compiler.register_unary_op(ops.AsTypeOp, pass_op=True)
def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp):
    to_type = bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype(
        op.to_type
    )
    if isinstance(x, ibis_types.NullScalar):
        return ibis_types.null().cast(to_type)

    # When casting DATETIME column into INT column, we need to convert the column into TIMESTAMP first.
    if to_type == ibis_dtypes.int64 and x.type() == ibis_dtypes.timestamp:
        utc_time_type = ibis_dtypes.Timestamp(timezone="UTC")
        x_converted = x.try_cast(utc_time_type) if op.safe else x.cast(utc_time_type)
        return bigframes.core.compile.ibis_types.cast_ibis_value(
            x_converted, to_type, safe=op.safe
        )

    if to_type == ibis_dtypes.int64 and x.type() == ibis_dtypes.time:
        # The conversion unit is set to "us" (microseconds) for consistency
        # with pandas converting time64[us][pyarrow] to int64[pyarrow].
        return x.delta(ibis_api.time("00:00:00"), part="microsecond")  # type: ignore

    if x.type() == ibis_dtypes.int64:
        # The conversion unit is set to "us" (microseconds) for consistency
        # with pandas converting int64[pyarrow] to timestamp[us][pyarrow],
        # timestamp[us, tz=UTC][pyarrow], and time64[us][pyarrow].
        unit = "us"
        x_converted = numeric_to_datetime(x, unit, safe=op.safe)
        if to_type == ibis_dtypes.timestamp:
            return (
                x_converted.try_cast(ibis_dtypes.Timestamp())
                if op.safe
                else x_converted.cast(ibis_dtypes.Timestamp())
            )
        elif to_type == ibis_dtypes.Timestamp(timezone="UTC"):
            return x_converted
        elif to_type == ibis_dtypes.time:
            return x_converted.time()

    if to_type == ibis_dtypes.json:
        if x.type() == ibis_dtypes.string:
            return parse_json_in_safe(x) if op.safe else parse_json(x)
        if x.type() == ibis_dtypes.bool:
            x_bool = typing.cast(
                ibis_types.StringValue,
                bigframes.core.compile.ibis_types.cast_ibis_value(
                    x, ibis_dtypes.string, safe=op.safe
                ),
            ).lower()
            return parse_json_in_safe(x_bool) if op.safe else parse_json(x_bool)
        if x.type() in (ibis_dtypes.int64, ibis_dtypes.float64):
            x_str = bigframes.core.compile.ibis_types.cast_ibis_value(
                x, ibis_dtypes.string, safe=op.safe
            )
            return parse_json_in_safe(x_str) if op.safe else parse_json(x_str)

    if x.type() == ibis_dtypes.json:
        if to_type == ibis_dtypes.int64:
            return cast_json_to_int64_in_safe(x) if op.safe else cast_json_to_int64(x)
        if to_type == ibis_dtypes.float64:
            return (
                cast_json_to_float64_in_safe(x) if op.safe else cast_json_to_float64(x)
            )
        if to_type == ibis_dtypes.bool:
            return cast_json_to_bool_in_safe(x) if op.safe else cast_json_to_bool(x)
        if to_type == ibis_dtypes.string:
            return cast_json_to_string_in_safe(x) if op.safe else cast_json_to_string(x)

    # TODO: either inline this function, or push rest of this op into the function
    return bigframes.core.compile.ibis_types.cast_ibis_value(x, to_type, safe=op.safe)
@scalar_op_compiler.register_unary_op(ops.IsInOp, pass_op=True)
def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp):
    # Keep only candidate values whose inferred literal type is compatible
    # with x's type (exact match, or both numeric) — avoids lossy casts.
    contains_nulls = any(is_null(value) for value in op.values)
    matchable_ibis_values = []
    for item in op.values:
        if not is_null(item):
            try:
                # we want values that *could* be cast to the dtype, but we don't want
                # to actually cast it, as that could be lossy (eg float -> int)
                item_inferred_type = ibis_types.literal(item).type()
                if (
                    x.type() == item_inferred_type
                    or x.type().is_numeric()
                    and item_inferred_type.is_numeric()
                ):
                    matchable_ibis_values.append(item)
            except TypeError:
                pass

    if op.match_nulls and contains_nulls:
        return x.isnull() | x.isin(matchable_ibis_values)
    else:
        return x.isin(matchable_ibis_values)


@scalar_op_compiler.register_unary_op(ops.ToDatetimeOp, pass_op=True)
def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp):
    if x.type() == ibis_dtypes.str:
        return x.try_cast(ibis_dtypes.Timestamp(None))  # type: ignore
    else:
        # Numerical inputs.
        if op.format:
            x = x.cast(ibis_dtypes.str).to_timestamp(op.format)  # type: ignore
        else:
            # The default unit is set to "ns" (nanoseconds) for consistency
            # with pandas, where "ns" is the default unit for datetime operations.
            unit = op.unit or "ns"
            x = numeric_to_datetime(x, unit)

    return x.cast(ibis_dtypes.Timestamp(None))  # type: ignore


@scalar_op_compiler.register_unary_op(ops.ToTimestampOp, pass_op=True)
def to_timestamp_op_impl(x: ibis_types.Value, op: ops.ToTimestampOp):
    if x.type() == ibis_dtypes.str:
        x = (
            typing.cast(ibis_types.StringValue, x).to_timestamp(op.format)
            if op.format
            else timestamp(x)
        )
    else:
        # Numerical inputs.
        if op.format:
            x = x.cast(ibis_dtypes.str).to_timestamp(op.format)  # type: ignore
        else:
            # The default unit is set to "ns" (nanoseconds) for consistency
            # with pandas, where "ns" is the default unit for datetime operations.
            unit = op.unit or "ns"
            x = numeric_to_datetime(x, unit)

    return x.cast(ibis_dtypes.Timestamp(timezone="UTC"))


@scalar_op_compiler.register_unary_op(ops.ToTimedeltaOp, pass_op=True)
def to_timedelta_op_impl(x: ibis_types.Value, op: ops.ToTimedeltaOp):
    # Timedeltas are encoded as integer microseconds.
    return (
        typing.cast(ibis_types.NumericValue, x) * UNIT_TO_US_CONVERSION_FACTORS[op.unit]  # type: ignore
    ).floor()


@scalar_op_compiler.register_unary_op(ops.timedelta_floor_op)
def timedelta_floor_op_impl(x: ibis_types.NumericValue):
    return x.floor()


# Remote (BigQuery routine) functions: bind the routine by name as an Ibis
# builtin scalar UDF and apply it to the operand(s).
@scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True)
def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp):
    udf_sig = op.function_def.signature
    ibis_py_sig = (udf_sig.py_input_types, udf_sig.py_output_type)

    @ibis_udf.scalar.builtin(
        name=str(op.function_def.routine_ref), signature=ibis_py_sig
    )
    def udf(input):
        ...

    x_transformed = udf(x)
    if not op.apply_on_null:
        # Pass NULL inputs through untouched instead of invoking the routine.
        return ibis_api.case().when(x.isnull(), x).else_(x_transformed).end()
    return x_transformed


@scalar_op_compiler.register_binary_op(ops.BinaryRemoteFunctionOp, pass_op=True)
def binary_remote_function_op_impl(
    x: ibis_types.Value, y: ibis_types.Value, op: ops.BinaryRemoteFunctionOp
):
    udf_sig = op.function_def.signature
    ibis_py_sig = (udf_sig.py_input_types, udf_sig.py_output_type)

    @ibis_udf.scalar.builtin(
        name=str(op.function_def.routine_ref), signature=ibis_py_sig
    )
    def udf(input1, input2):
        ...

    x_transformed = udf(x, y)
    return x_transformed


@scalar_op_compiler.register_nary_op(ops.NaryRemoteFunctionOp, pass_op=True)
def nary_remote_function_op_impl(
    *operands: ibis_types.Value, op: ops.NaryRemoteFunctionOp
):
    udf_sig = op.function_def.signature
    ibis_py_sig = (udf_sig.py_input_types, udf_sig.py_output_type)
    arg_names = tuple(arg.name for arg in udf_sig.input_types)

    @ibis_udf.scalar.builtin(
        name=str(op.function_def.routine_ref),
        signature=ibis_py_sig,
        param_name_overrides=arg_names,
    )
    def udf(*inputs):
        ...

    result = udf(*operands)
    return result


@scalar_op_compiler.register_unary_op(ops.MapOp, pass_op=True)
def map_op_impl(x: ibis_types.Value, op: ops.MapOp):
    # Build a CASE expression from (from, to) pairs; unmatched values pass through.
    case = ibis_api.case()
    for mapping in op.mappings:
        case = case.when(x == mapping[0], mapping[1])
    return case.else_(x).end()


# Array Ops
@scalar_op_compiler.register_unary_op(ops.ArrayToStringOp, pass_op=True)
def array_to_string_op_impl(x: ibis_types.Value, op: ops.ArrayToStringOp):
    return typing.cast(ibis_types.ArrayValue, x).join(op.delimiter)


@scalar_op_compiler.register_unary_op(ops.ArrayIndexOp, pass_op=True)
def array_index_op_impl(x: ibis_types.Value, op: ops.ArrayIndexOp):
    res = typing.cast(ibis_types.ArrayValue, x)[op.index]
    if x.type().is_string():
        # Empty string marks a missing element; surface it as NULL.
        return _null_or_value(res, res != ibis_types.literal(""))
    else:
        return res


@scalar_op_compiler.register_unary_op(ops.ArraySliceOp, pass_op=True)
def array_slice_op_impl(x: ibis_types.Value, op: ops.ArraySliceOp):
    res = typing.cast(ibis_types.ArrayValue, x)[op.start : op.stop : op.step]
    if x.type().is_string():
        return _null_or_value(res, res != ibis_types.literal(""))
    else:
        return res


# JSON Ops
@scalar_op_compiler.register_binary_op(ops.JSONSet, pass_op=True)
def json_set_op_impl(x: ibis_types.Value, y: ibis_types.Value, op: ops.JSONSet):
    return json_set(json_obj=x, json_path=op.json_path, json_value=y)
# The JSON UDFs below patch their return annotation at call time so the
# output type mirrors the input (JSON in -> JSON out, STRING in -> STRING out).
@scalar_op_compiler.register_unary_op(ops.JSONExtract, pass_op=True)
def json_extract_op_impl(x: ibis_types.Value, op: ops.JSONExtract):
    # Define a user-defined function whose returned type is dynamically matching the input.
    def json_extract(json_or_json_string, json_path: ibis_dtypes.str):  # type: ignore
        """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value."""
        ...

    return_type = x.type()
    json_extract.__annotations__["return"] = return_type
    json_extract_op = ibis_udf.scalar.builtin(json_extract)
    return json_extract_op(json_or_json_string=x, json_path=op.json_path)


@scalar_op_compiler.register_unary_op(ops.JSONExtractArray, pass_op=True)
def json_extract_array_op_impl(x: ibis_types.Value, op: ops.JSONExtractArray):
    # Define a user-defined function whose returned type is dynamically matching the input.
    def json_extract_array(json_or_json_string, json_path: ibis_dtypes.str):  # type: ignore
        """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value."""
        ...

    return_type = x.type()
    json_extract_array.__annotations__["return"] = ibis_dtypes.Array[return_type]  # type: ignore
    json_extract_op = ibis_udf.scalar.builtin(json_extract_array)
    return json_extract_op(json_or_json_string=x, json_path=op.json_path)


@scalar_op_compiler.register_unary_op(ops.JSONExtractStringArray, pass_op=True)
def json_extract_string_array_op_impl(
    x: ibis_types.Value, op: ops.JSONExtractStringArray
):
    return json_extract_string_array(json_obj=x, json_path=op.json_path)


@scalar_op_compiler.register_unary_op(ops.JSONQuery, pass_op=True)
def json_query_op_impl(x: ibis_types.Value, op: ops.JSONQuery):
    # Define a user-defined function whose returned type is dynamically matching the input.
    def json_query(json_or_json_string, json_path: ibis_dtypes.str):  # type: ignore
        """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value."""
        ...

    return_type = x.type()
    json_query.__annotations__["return"] = return_type
    json_query_op = ibis_udf.scalar.builtin(json_query)
    return json_query_op(json_or_json_string=x, json_path=op.json_path)


@scalar_op_compiler.register_unary_op(ops.JSONQueryArray, pass_op=True)
def json_query_array_op_impl(x: ibis_types.Value, op: ops.JSONQueryArray):
    # Define a user-defined function whose returned type is dynamically matching the input.
    def json_query_array(json_or_json_string, json_path: ibis_dtypes.str):  # type: ignore
        """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value."""
        ...

    return_type = x.type()
    json_query_array.__annotations__["return"] = ibis_dtypes.Array[return_type]  # type: ignore
    json_query_op = ibis_udf.scalar.builtin(json_query_array)
    return json_query_op(json_or_json_string=x, json_path=op.json_path)


@scalar_op_compiler.register_unary_op(ops.ParseJSON, pass_op=True)
def parse_json_op_impl(x: ibis_types.Value, op: ops.ParseJSON):
    return parse_json(json_str=x)


@scalar_op_compiler.register_unary_op(ops.ToJSONString)
def to_json_string_op_impl(json_obj: ibis_types.Value):
    return to_json_string(json_obj=json_obj)


@scalar_op_compiler.register_unary_op(ops.JSONValue, pass_op=True)
def json_value_op_impl(x: ibis_types.Value, op: ops.JSONValue):
    return json_value(json_obj=x, json_path=op.json_path)


@scalar_op_compiler.register_unary_op(ops.JSONValueArray, pass_op=True)
def json_value_array_op_impl(x: ibis_types.Value, op: ops.JSONValueArray):
    return json_value_array(json_obj=x, json_path=op.json_path)


# Blob Ops
@scalar_op_compiler.register_unary_op(ops.obj_fetch_metadata_op)
def obj_fetch_metadata_op_impl(obj_ref: ibis_types.Value):
    return obj_fetch_metadata(obj_ref=obj_ref)


@scalar_op_compiler.register_unary_op(ops.ObjGetAccessUrl, pass_op=True)
def obj_get_access_url_op_impl(obj_ref: ibis_types.Value, op: ops.ObjGetAccessUrl):
    return obj_get_access_url(obj_ref=obj_ref, mode=op.mode)


### Binary Ops
def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None):
    """Wraps a binary operator to generate nulls of the expected type if either input is a null scalar."""

    def short_circuit_nulls_inner(binop):
        @functools.wraps(binop)
        def wrapped_binop(x: ibis_types.Value, y: ibis_types.Value):
            if isinstance(x, ibis_types.NullScalar):
                return ibis_types.null().cast(type_override or y.type())
            elif isinstance(y, ibis_types.NullScalar):
                return ibis_types.null().cast(type_override or x.type())
            else:
                return binop(x, y)

        return wrapped_binop

    return short_circuit_nulls_inner


@scalar_op_compiler.register_binary_op(ops.strconcat_op)
def concat_op(
    x: ibis_types.Value,
    y: ibis_types.Value,
):
    x_string = typing.cast(ibis_types.StringValue, x)
    y_string = typing.cast(ibis_types.StringValue, y)
    return x_string.concat(y_string)


@scalar_op_compiler.register_binary_op(ops.eq_op)
def eq_op(
    x: ibis_types.Value,
    y: ibis_types.Value,
):
    x, y = _coerce_comparables(x, y)
    return x == y


@scalar_op_compiler.register_binary_op(ops.eq_null_match_op)
def eq_nulls_match_op(
    x: ibis_types.Value,
    y: ibis_types.Value,
):
    """Variant of eq_op where nulls match each other.
    Only use where dtypes are known to be same."""
    # Compare via string casts with NULLs replaced by a sentinel so that
    # NULL == NULL. fill_null vs fillna covers both newer and older Ibis APIs.
    x, y = _coerce_comparables(x, y)
    literal = ibis_types.literal("$NULL_SENTINEL$")
    if hasattr(x, "fill_null"):
        left = x.cast(ibis_dtypes.str).fill_null(literal)
        right = y.cast(ibis_dtypes.str).fill_null(literal)
    else:
        left = x.cast(ibis_dtypes.str).fillna(literal)
        right = y.cast(ibis_dtypes.str).fillna(literal)

    return left == right


@scalar_op_compiler.register_binary_op(ops.ne_op)
def ne_op(
    x: ibis_types.Value,
    y: ibis_types.Value,
):
    x, y = _coerce_comparables(x, y)
    return x != y


def _null_or_value(value: ibis_types.Value, where_value: ibis_types.BooleanValue):
    # Return `value` where the condition holds, NULL otherwise.
    return ibis_api.ifelse(
        where_value,
        value,
        ibis_types.null(),
    )


def _coerce_comparables(
    x: ibis_types.Value,
    y: ibis_types.Value,
):
    # Promote a boolean operand to int64 when compared against a non-boolean.
    if x.type().is_boolean() and not y.type().is_boolean():
        x = x.cast(ibis_dtypes.int64)
    elif y.type().is_boolean() and not x.type().is_boolean():
        y = y.cast(ibis_dtypes.int64)
    return x, y


@scalar_op_compiler.register_binary_op(ops.and_op)
def and_op(
    x: ibis_types.Value,
    y: ibis_types.Value,
):
    # Workaround issue https://github.com/ibis-project/ibis/issues/7775 by
    # implementing three-valued logic ourselves. For AND, when we encounter a
    # NULL value, we only know when the result is FALSE, otherwise the result
    # is unknown (NULL).
See: truth table at + # https://en.wikibooks.org/wiki/Structured_Query_Language/NULLs_and_the_Three_Valued_Logic#AND,_OR + if isinstance(x, ibis_types.NullScalar): + return _null_or_value(y, y == ibis_types.literal(False)) + + if isinstance(y, ibis_types.NullScalar): + return _null_or_value(x, x == ibis_types.literal(False)) + return typing.cast(ibis_types.BooleanValue, x) & typing.cast( + ibis_types.BooleanValue, y + ) + + +@scalar_op_compiler.register_binary_op(ops.or_op) +def or_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + # Workaround issue https://github.com/ibis-project/ibis/issues/7775 by + # implementing three-valued logic ourselves. For OR, when we encounter a + # NULL value, we only know when the result is TRUE, otherwise the result + # is unknown (NULL). See: truth table at + # https://en.wikibooks.org/wiki/Structured_Query_Language/NULLs_and_the_Three_Valued_Logic#AND,_OR + if isinstance(x, ibis_types.NullScalar): + return _null_or_value(y, y == ibis_types.literal(True)) + + if isinstance(y, ibis_types.NullScalar): + return _null_or_value(x, x == ibis_types.literal(True)) + return typing.cast(ibis_types.BooleanValue, x) | typing.cast( + ibis_types.BooleanValue, y + ) + + +@scalar_op_compiler.register_binary_op(ops.xor_op) +def xor_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + return typing.cast(ibis_types.BooleanValue, x) ^ typing.cast( + ibis_types.BooleanValue, y + ) + + +@scalar_op_compiler.register_binary_op(ops.add_op) +@short_circuit_nulls() +def add_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + if isinstance(x, ibis_types.NullScalar) or isinstance(x, ibis_types.NullScalar): + return ibis_types.null() + return x + y # type: ignore + + +@scalar_op_compiler.register_binary_op(ops.sub_op) +@short_circuit_nulls() +def sub_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + return typing.cast(ibis_types.NumericValue, x) - typing.cast( + ibis_types.NumericValue, y + ) + + 
+@scalar_op_compiler.register_binary_op(ops.mul_op) +@short_circuit_nulls() +def mul_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + return typing.cast(ibis_types.NumericValue, x) * typing.cast( + ibis_types.NumericValue, y + ) + + +@scalar_op_compiler.register_binary_op(ops.div_op) +@short_circuit_nulls(ibis_dtypes.float) +def div_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + return typing.cast(ibis_types.NumericValue, x) / typing.cast( + ibis_types.NumericValue, y + ) + + +@scalar_op_compiler.register_binary_op(ops.pow_op) +@short_circuit_nulls(ibis_dtypes.float) +def pow_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + if x.type().is_integer() and y.type().is_integer(): + return _int_pow_op(x, y) + else: + return _float_pow_op(x, y) + + +@scalar_op_compiler.register_binary_op(ops.unsafe_pow_op) +@short_circuit_nulls(ibis_dtypes.float) +def unsafe_pow_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + """For internal use only - where domain and overflow checks are not needed.""" + return typing.cast(ibis_types.NumericValue, x) ** typing.cast( + ibis_types.NumericValue, y + ) + + +def _int_pow_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + # Need to avoid any error cases - should produce NaN instead + # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#pow + x_as_decimal = typing.cast( + ibis_types.NumericValue, + x.cast(ibis_dtypes.Decimal(precision=38, scale=9, nullable=True)), + ) + y_val = typing.cast(ibis_types.NumericValue, y) + + # BQ POW() function outputs FLOAT64, which can lose precision. + # Therefore, we do math in NUMERIC and cast back down after. + # Also, explicit bounds checks, pandas will silently overflow. 
+ pow_result = x_as_decimal**y_val + overflow_cond = (pow_result > _ibis_num((2**63) - 1)) | ( + pow_result < _ibis_num(-(2**63)) + ) + + return ( + ibis_api.case() + .when((overflow_cond), ibis_types.null()) + .else_(pow_result.cast(ibis_dtypes.int64)) + .end() + ) + + +def _float_pow_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + # Most conditions here seek to prevent calling BQ POW with inputs that would generate errors. + # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#pow + x_val = typing.cast(ibis_types.NumericValue, x) + y_val = typing.cast(ibis_types.NumericValue, y) + + overflow_cond = (x_val != _ZERO) & ((y_val * x_val.abs().ln()) > _FLOAT64_EXP_BOUND) + + # Float64 lose integer precision beyond 2**53, beyond this insufficient precision to get parity + exp_too_big = y_val.abs() > _ibis_num(2**53) + # Treat very large exponents as +=INF + norm_exp = exp_too_big.ifelse(_INF * y_val.sign(), y_val) + + pow_result = x_val**norm_exp + + # This cast is dangerous, need to only excuted where y_val has been bounds-checked + # Ibis needs try_cast binding to bq safe_cast + exponent_is_whole = y_val.cast(ibis_dtypes.int64) == y_val + odd_exponent = (x_val < _ZERO) & ( + y_val.cast(ibis_dtypes.int64) % _ibis_num(2) == _ibis_num(1) + ) + infinite_base = x_val.abs() == _INF + + return ( + ibis_api.case() + # Might be able to do something more clever with x_val==0 case + .when(y_val == _ZERO, _ibis_num(1)) + .when( + x_val == _ibis_num(1), _ibis_num(1) + ) # Need to ignore exponent, even if it is NA + .when( + (x_val == _ZERO) & (y_val < _ZERO), _INF + ) # This case would error POW function in BQ + .when(infinite_base, pow_result) + .when( + exp_too_big, pow_result + ) # Bigquery can actually handle the +-inf cases gracefully + .when((x_val < _ZERO) & (~exponent_is_whole), _NAN) + .when( + overflow_cond, _INF * odd_exponent.ifelse(_ibis_num(-1), _ibis_num(1)) + ) # finite overflows would cause bq to error + 
.else_(pow_result) + .end() + ) + + +@scalar_op_compiler.register_binary_op(ops.lt_op) +@short_circuit_nulls(ibis_dtypes.bool) +def lt_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + x, y = _coerce_comparables(x, y) + return x < y + + +@scalar_op_compiler.register_binary_op(ops.le_op) +@short_circuit_nulls(ibis_dtypes.bool) +def le_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + x, y = _coerce_comparables(x, y) + return x <= y + + +@scalar_op_compiler.register_binary_op(ops.gt_op) +@short_circuit_nulls(ibis_dtypes.bool) +def gt_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + x, y = _coerce_comparables(x, y) + return x > y + + +@scalar_op_compiler.register_binary_op(ops.ge_op) +@short_circuit_nulls(ibis_dtypes.bool) +def ge_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + x, y = _coerce_comparables(x, y) + return x >= y + + +@scalar_op_compiler.register_binary_op(ops.floordiv_op) +@short_circuit_nulls(ibis_dtypes.int) +def floordiv_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + x_numeric = typing.cast(ibis_types.NumericValue, x) + y_numeric = typing.cast(ibis_types.NumericValue, y) + floordiv_expr = x_numeric // y_numeric + + # DIV(N, 0) will error in bigquery, but needs to return 0 for int, and inf for float in BQ so we short-circuit in this case. + # Multiplying left by zero propogates nulls. 
+ zero_result = _INF if (x.type().is_floating() or y.type().is_floating()) else _ZERO + return ( + ibis_api.case() + .when(y_numeric == _ZERO, zero_result * x_numeric) + .else_(floordiv_expr) + .end() + ) + + +def _is_bignumeric(x: ibis_types.Value): + if not isinstance(x, ibis_types.DecimalValue): + return False + # Should be exactly 76 for bignumeric + return x.precision > 70 # type: ignore + + +def _is_numeric(x: ibis_types.Value): + # either big-numeric or numeric + return isinstance(x, ibis_types.DecimalValue) + + +@scalar_op_compiler.register_binary_op(ops.mod_op) +@short_circuit_nulls() +def mod_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + # Hacky short-circuit to avoid passing zero-literal to sql backend, evaluate locally instead to null. + op = y.op() + if isinstance(op, ibis_generic.Literal) and op.value == 0: + return ibis_types.null().cast(x.type()) + + if x.type().is_integer() and y.type().is_integer(): + # both are ints, no casting necessary + return _int_mod( + typing.cast(ibis_types.IntegerValue, x), + typing.cast(ibis_types.IntegerValue, y), + ) + + else: + # bigquery doens't support float mod, so just cast to bignumeric and hope for the best + x_numeric = typing.cast( + ibis_types.DecimalValue, + x.cast(ibis_dtypes.Decimal(precision=76, scale=38, nullable=True)), + ) + y_numeric = typing.cast( + ibis_types.DecimalValue, + y.cast(ibis_dtypes.Decimal(precision=76, scale=38, nullable=True)), + ) + mod_numeric = _bignumeric_mod(x_numeric, y_numeric) # type: ignore + + # Cast back down based on original types + if _is_bignumeric(x) or _is_bignumeric(y): + return mod_numeric + if _is_numeric(x) or _is_numeric(y): + return mod_numeric.cast(ibis_dtypes.Decimal(38, 9)) + else: + return mod_numeric.cast(ibis_dtypes.float64) + + +def _bignumeric_mod( + x: ibis_types.IntegerValue, + y: ibis_types.IntegerValue, +): + # Hacky short-circuit to avoid passing zero-literal to sql backend, evaluate locally instead to null. 
+ op = y.op() + if isinstance(op, ibis_generic.Literal) and op.value == 0: + return ibis_types.null().cast(x.type()) + + bq_mod = x % y # Bigquery will maintain x sign here + + # In BigQuery returned value has the same sign as X. In pandas, the sign of y is used, so we need to flip the result if sign(x) != sign(y) + return ( + ibis_api.case() + .when( + y == _ZERO, + _NAN * x, + ) # Dummy op to propogate nulls and type from x arg + .when( + (y < _ZERO) & (bq_mod > _ZERO), (y + bq_mod) + ) # Convert positive result to negative + .when( + (y > _ZERO) & (bq_mod < _ZERO), (y + bq_mod) + ) # Convert negative result to positive + .else_(bq_mod) + .end() + ) + + +def _int_mod( + x: ibis_types.IntegerValue, + y: ibis_types.IntegerValue, +): + # Hacky short-circuit to avoid passing zero-literal to sql backend, evaluate locally instead to null. + op = y.op() + if isinstance(op, ibis_generic.Literal) and op.value == 0: + return ibis_types.null().cast(x.type()) + + bq_mod = x % y # Bigquery will maintain x sign here + + # In BigQuery returned value has the same sign as X. 
In pandas, the sign of y is used, so we need to flip the result if sign(x) != sign(y) + return ( + ibis_api.case() + .when( + y == _ZERO, + _ZERO * x, + ) # Dummy op to propogate nulls and type from x arg + .when( + (y < _ZERO) & (bq_mod > _ZERO), (y + bq_mod) + ) # Convert positive result to negative + .when( + (y > _ZERO) & (bq_mod < _ZERO), (y + bq_mod) + ) # Convert negative result to positive + .else_(bq_mod) + .end() + ) + + +@scalar_op_compiler.register_binary_op(ops.fillna_op) +def fillna_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + if hasattr(x, "fill_null"): + return x.fill_null(typing.cast(ibis_types.Scalar, y)) + else: + return x.fillna(typing.cast(ibis_types.Scalar, y)) + + +@scalar_op_compiler.register_binary_op(ops.round_op) +def round_op(x: ibis_types.Value, y: ibis_types.Value): + if x.type().is_integer(): + # bq produces float64, but pandas returns int + return ( + typing.cast(ibis_types.NumericValue, x) + .round(digits=typing.cast(ibis_types.IntegerValue, y)) + .cast(ibis_dtypes.int64) + ) + return typing.cast(ibis_types.NumericValue, x).round( + digits=typing.cast(ibis_types.IntegerValue, y) + ) + + +@scalar_op_compiler.register_binary_op(ops.coalesce_op) +def coalesce_impl( + x: ibis_types.Value, + y: ibis_types.Value, +): + if x.name("name").equals(y.name("name")): + return x + else: + return ibis_api.coalesce(x, y) + + +@scalar_op_compiler.register_binary_op(ops.maximum_op) +def maximum_impl( + value: ibis_types.Value, + lower: ibis_types.Value, +): + # Note: propagates nulls + return ( + ibis_api.case().when(lower.isnull() | (value < lower), lower).else_(value).end() + ) + + +@scalar_op_compiler.register_binary_op(ops.minimum_op) +def minimum_impl( + value: ibis_types.Value, + upper: ibis_types.Value, +): + # Note: propagates nulls + return ( + ibis_api.case().when(upper.isnull() | (value > upper), upper).else_(value).end() + ) + + +@scalar_op_compiler.register_binary_op(ops.cosine_distance_op) +def cosine_distance_impl( + 
vector1: ibis_types.Value, + vector2: ibis_types.Value, +): + return vector_distance(vector1, vector2, "COSINE") + + +@scalar_op_compiler.register_binary_op(ops.euclidean_distance_op) +def euclidean_distance_impl( + vector1: ibis_types.Value, + vector2: ibis_types.Value, +): + return vector_distance(vector1, vector2, "EUCLIDEAN") + + +@scalar_op_compiler.register_binary_op(ops.manhattan_distance_op) +def manhattan_distance_impl( + vector1: ibis_types.Value, + vector2: ibis_types.Value, +): + return vector_distance(vector1, vector2, "MANHATTAN") + + +# Blob Ops +@scalar_op_compiler.register_binary_op(ops.obj_make_ref_op) +def obj_make_ref_op(x: ibis_types.Value, y: ibis_types.Value): + return obj_make_ref(uri=x, authorizer=y) + + +# Ternary Operations +@scalar_op_compiler.register_ternary_op(ops.where_op) +def where_op( + original: ibis_types.Value, + condition: ibis_types.Value, + replacement: ibis_types.Value, +) -> ibis_types.Value: + """Returns x if y is true, otherwise returns z.""" + return ibis_api.case().when(condition, original).else_(replacement).end() # type: ignore + + +@scalar_op_compiler.register_ternary_op(ops.clip_op) +def clip_op( + original: ibis_types.Value, + lower: ibis_types.Value, + upper: ibis_types.Value, +) -> ibis_types.Value: + """Clips value to lower and upper bounds.""" + if isinstance(lower, ibis_types.NullScalar) and ( + not isinstance(upper, ibis_types.NullScalar) + ): + return ibis_api.least(original, upper) + elif (not isinstance(lower, ibis_types.NullScalar)) and isinstance( + upper, ibis_types.NullScalar + ): + return ibis_api.greatest(original, lower) + elif isinstance(lower, ibis_types.NullScalar) and ( + isinstance(upper, ibis_types.NullScalar) + ): + return original + else: + # Note: Pandas has unchanged behavior when upper bound and lower bound are flipped. 
This implementation requires that lower_bound < upper_bound + return ibis_api.greatest(ibis_api.least(original, upper), lower) + + +# N-ary Operations +@scalar_op_compiler.register_nary_op(ops.case_when_op) +def case_when_op(*cases_and_outputs: ibis_types.Value) -> ibis_types.Value: + # ibis can handle most type coercions, but we need to force bool -> int + # TODO: dispatch coercion depending on bigframes dtype schema + result_values = cases_and_outputs[1::2] + do_upcast_bool = any(t.type().is_numeric() for t in result_values) + if do_upcast_bool: + # Just need to upcast to int, ibis can handle further coercion + result_values = tuple( + val.cast(ibis_dtypes.int64) if val.type().is_boolean() else val + for val in result_values + ) + + case_val = ibis_api.case() + for predicate, output in zip(cases_and_outputs[::2], result_values): + case_val = case_val.when(predicate, output) + return case_val.end() # type: ignore + + +@scalar_op_compiler.register_nary_op(ops.SqlScalarOp, pass_op=True) +def sql_scalar_op_impl(*operands: ibis_types.Value, op: ops.SqlScalarOp): + return ibis_generic.SqlScalar( + op.sql_template, + values=tuple(typing.cast(ibis_generic.Value, expr.op()) for expr in operands), + output_type=bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype( + op.output_type() + ), + ).to_expr() + + +@scalar_op_compiler.register_nary_op(ops.StructOp, pass_op=True) +def struct_op_impl( + *values: ibis_types.Value, op: ops.StructOp +) -> ibis_types.StructValue: + data = {} + for i, value in enumerate(values): + data[op.column_names[i]] = value + + return ibis_types.struct(data) + + +@scalar_op_compiler.register_nary_op(ops.RowKey, pass_op=True) +def rowkey_op_impl(*values: ibis_types.Value, op: ops.RowKey) -> ibis_types.Value: + return bigframes.core.compile.default_ordering.gen_row_key(values) + + +# Helpers +def is_null(value) -> bool: + # float NaN/inf should be treated as distinct from 'true' null values + return typing.cast(bool, pd.isna(value)) and not 
isinstance(value, float) + + +def _ibis_num(number: float): + return typing.cast(ibis_types.NumericValue, ibis_types.literal(number)) + + +@ibis_udf.scalar.builtin +def st_geogfromtext(a: str) -> ibis_dtypes.geography: # type: ignore + """Convert string to geography.""" + + +@ibis_udf.scalar.builtin +def timestamp(a: str) -> ibis_dtypes.timestamp: # type: ignore + """Convert string to timestamp.""" + + +@ibis_udf.scalar.builtin +def unix_millis(a: ibis_dtypes.timestamp) -> int: # type: ignore + """Convert a timestamp to milliseconds""" + + +@ibis_udf.scalar.builtin +def st_boundary(a: ibis_dtypes.geography) -> ibis_dtypes.geography: # type: ignore + """Find the boundary of a geography.""" + + +@ibis_udf.scalar.builtin +def st_distance(a: ibis_dtypes.geography, b: ibis_dtypes.geography, use_spheroid: bool) -> ibis_dtypes.float: # type: ignore + """Convert string to geography.""" + + +@ibis_udf.scalar.builtin +def st_length(geog: ibis_dtypes.geography, use_spheroid: bool) -> ibis_dtypes.float: # type: ignore + """ST_LENGTH BQ builtin. 
This body is never executed.""" + pass + + +@ibis_udf.scalar.builtin +def unix_micros(a: ibis_dtypes.timestamp) -> int: # type: ignore + """Convert a timestamp to microseconds""" + + +# Need these because ibis otherwise tries to do casts to int that can fail +@ibis_udf.scalar.builtin(name="floor") +def float_floor(a: float) -> float: + """Convert string to timestamp.""" + return 0 # pragma: NO COVER + + +@ibis_udf.scalar.builtin(name="ceil") +def float_ceil(a: float) -> float: + """Convert string to timestamp.""" + return 0 # pragma: NO COVER + + +@ibis_udf.scalar.builtin(name="parse_json") +def parse_json(json_str: str) -> ibis_dtypes.JSON: # type: ignore[empty-body] + """Converts a JSON-formatted STRING value to a JSON value.""" + + +@ibis_udf.scalar.builtin(name="SAFE.PARSE_JSON") +def parse_json_in_safe(json_str: str) -> ibis_dtypes.JSON: # type: ignore[empty-body] + """Converts a JSON-formatted STRING value to a JSON value in the safe mode.""" + + +@ibis_udf.scalar.builtin(name="json_set") +def json_set( # type: ignore[empty-body] + json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String, json_value +) -> ibis_dtypes.JSON: + """Produces a new SQL JSON value with the specified JSON data inserted or replaced.""" + + +@ibis_udf.scalar.builtin(name="json_extract_string_array") +def json_extract_string_array( # type: ignore[empty-body] + json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String +) -> ibis_dtypes.Array[ibis_dtypes.String]: + """Extracts a JSON array and converts it to a SQL ARRAY of STRINGs.""" + + +@ibis_udf.scalar.builtin(name="to_json_string") +def to_json_string( # type: ignore[empty-body] + json_obj: ibis_dtypes.JSON, +) -> ibis_dtypes.String: + """Convert JSON to STRING.""" + + +@ibis_udf.scalar.builtin(name="json_value") +def json_value( # type: ignore[empty-body] + json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String +) -> ibis_dtypes.String: + """Retrieve value of a JSON field as plain STRING.""" + + 
+@ibis_udf.scalar.builtin(name="json_value_array") +def json_value_array( # type: ignore[empty-body] + json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String +) -> ibis_dtypes.Array[ibis_dtypes.String]: + """Extracts a JSON array and converts it to a SQL ARRAY of STRINGs.""" + + +@ibis_udf.scalar.builtin(name="INT64") +def cast_json_to_int64(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Int64: # type: ignore[empty-body] + """Converts a JSON number to a SQL INT64 value.""" + + +@ibis_udf.scalar.builtin(name="SAFE.INT64") +def cast_json_to_int64_in_safe(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Int64: # type: ignore[empty-body] + """Converts a JSON number to a SQL INT64 value in the safe mode.""" + + +@ibis_udf.scalar.builtin(name="FLOAT64") +def cast_json_to_float64(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Float64: # type: ignore[empty-body] + """Attempts to convert a JSON value to a SQL FLOAT64 value.""" + + +@ibis_udf.scalar.builtin(name="SAFE.FLOAT64") +def cast_json_to_float64_in_safe(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Float64: # type: ignore[empty-body] + """Attempts to convert a JSON value to a SQL FLOAT64 value.""" + + +@ibis_udf.scalar.builtin(name="BOOL") +def cast_json_to_bool(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Boolean: # type: ignore[empty-body] + """Attempts to convert a JSON value to a SQL BOOL value.""" + + +@ibis_udf.scalar.builtin(name="SAFE.BOOL") +def cast_json_to_bool_in_safe(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Boolean: # type: ignore[empty-body] + """Attempts to convert a JSON value to a SQL BOOL value.""" + + +@ibis_udf.scalar.builtin(name="STRING") +def cast_json_to_string(json_str: ibis_dtypes.JSON) -> ibis_dtypes.String: # type: ignore[empty-body] + """Attempts to convert a JSON value to a SQL STRING value.""" + + +@ibis_udf.scalar.builtin(name="SAFE.STRING") +def cast_json_to_string_in_safe(json_str: ibis_dtypes.JSON) -> ibis_dtypes.String: # type: ignore[empty-body] + """Attempts to convert a JSON value to a SQL 
STRING value.""" + + +@ibis_udf.scalar.builtin(name="ML.DISTANCE") +def vector_distance(vector1, vector2, type: str) -> ibis_dtypes.Float64: # type: ignore[empty-body] + """Computes the distance between two vectors using specified type ("EUCLIDEAN", "MANHATTAN", or "COSINE")""" + + +@ibis_udf.scalar.builtin(name="OBJ.FETCH_METADATA") +def obj_fetch_metadata(obj_ref: _OBJ_REF_IBIS_DTYPE) -> _OBJ_REF_IBIS_DTYPE: # type: ignore + """Fetch metadata from ObjectRef Struct.""" + + +@ibis_udf.scalar.builtin(name="OBJ.MAKE_REF") +def obj_make_ref(uri: str, authorizer: str) -> _OBJ_REF_IBIS_DTYPE: # type: ignore + """Make ObjectRef Struct from uri and connection.""" + + +@ibis_udf.scalar.builtin(name="OBJ.GET_ACCESS_URL") +def obj_get_access_url(obj_ref: _OBJ_REF_IBIS_DTYPE, mode: ibis_dtypes.String) -> ibis_dtypes.JSON: # type: ignore + """Get access url (as ObjectRefRumtime JSON) from ObjectRef.""" + + +@ibis_udf.scalar.builtin(name="ltrim") +def str_lstrip_op( # type: ignore[empty-body] + x: ibis_dtypes.String, to_strip: ibis_dtypes.String +) -> ibis_dtypes.String: + """Remove leading and trailing characters.""" + + +@ibis_udf.scalar.builtin +def st_isclosed(a: ibis_dtypes.geography) -> ibis_dtypes.boolean: # type: ignore + """Checks if a geography is closed.""" + + +@ibis_udf.scalar.builtin(name="rtrim") +def str_rstrip_op( # type: ignore[empty-body] + x: ibis_dtypes.String, to_strip: ibis_dtypes.String +) -> ibis_dtypes.String: + """Remove leading and trailing characters.""" + + +@ibis_udf.scalar.builtin(name="trim") +def str_strip_op( # type: ignore[empty-body] + x: ibis_dtypes.String, to_strip: ibis_dtypes.String +) -> ibis_dtypes.String: + """Remove leading and trailing characters.""" diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index 7b20e430ff..ce31a740e6 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -19,15 +19,18 @@ import functools import itertools import typing -from typing import Callable, Generator, 
Mapping, TypeVar, Union +from typing import Callable, Generator, Mapping, TYPE_CHECKING, TypeVar, Union import pandas as pd from bigframes import dtypes from bigframes.core import field import bigframes.core.identifiers as ids -import bigframes.operations -import bigframes.operations.aggregations as agg_ops + +if TYPE_CHECKING: + # Avoid circular imports. + import bigframes.operations + import bigframes.operations.aggregations as agg_ops def const( diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 247d923fe7..abf3ed26fc 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -88,6 +88,8 @@ SqlScalarOp, where_op, ) +from bigframes.operations.generic_ops.isnull_op import isnull_op +from bigframes.operations.generic_ops.notnull_op import notnull_op from bigframes.operations.geo_ops import ( geo_area_op, geo_st_astext_op, @@ -102,7 +104,6 @@ GeoStDistanceOp, GeoStLengthOp, ) -from bigframes.operations.isnull_op import isnull_op, notnull_op from bigframes.operations.json_ops import ( JSONExtract, JSONExtractArray, diff --git a/bigframes/operations/generic_ops.py b/bigframes/operations/generic_ops/__init__.py similarity index 100% rename from bigframes/operations/generic_ops.py rename to bigframes/operations/generic_ops/__init__.py diff --git a/bigframes/operations/generic_ops/isnull_op.py b/bigframes/operations/generic_ops/isnull_op.py new file mode 100644 index 0000000000..ff0ed6d822 --- /dev/null +++ b/bigframes/operations/generic_ops/isnull_op.py @@ -0,0 +1,68 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import dataclasses +from typing import ClassVar, TYPE_CHECKING + +# Imports for Ibis compilation +from bigframes_vendored.ibis.expr import types as ibis_types + +# Direct imports from bigframes +from bigframes import dtypes +from bigframes.core.compile import scalar_op_compiler +import bigframes.core.compile.polars.compiler as polars_compiler +from bigframes.operations import base_ops + +if TYPE_CHECKING: + import polars as pl + + +@dataclasses.dataclass(frozen=True) +class IsNullOp(base_ops.UnaryOp): + name: ClassVar[str] = "isnull" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return dtypes.BOOL_DTYPE + + +isnull_op = IsNullOp() + + +def _ibis_isnull_op_impl(x: ibis_types.Value): + return x.isnull() + + +scalar_op_compiler.scalar_op_compiler.register_unary_op(isnull_op)(_ibis_isnull_op_impl) + + +if hasattr(polars_compiler, "PolarsExpressionCompiler"): + + def _polars_isnull_op_impl( + compiler: polars_compiler.PolarsExpressionCompiler, op: IsNullOp, input: pl.Expr + ) -> pl.Expr: + return input.is_null() + + # TODO(https://github.com/python/mypy/issues/13040): remove `type: ignore` + # when mypy can better handle singledispatch. 
+ polars_compiler.PolarsExpressionCompiler.compile_op.register( # type: ignore + IsNullOp, _polars_isnull_op_impl + ) + + +__all__ = [ + "IsNullOp", + "isnull_op", +] diff --git a/bigframes/operations/generic_ops/notnull_op.py b/bigframes/operations/generic_ops/notnull_op.py new file mode 100644 index 0000000000..67f405e1e0 --- /dev/null +++ b/bigframes/operations/generic_ops/notnull_op.py @@ -0,0 +1,72 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import dataclasses +from typing import ClassVar, TYPE_CHECKING + +# Imports for Ibis compilation +from bigframes_vendored.ibis.expr import types as ibis_types + +# Direct imports from bigframes +from bigframes import dtypes +from bigframes.core.compile import scalar_op_compiler +import bigframes.core.compile.polars.compiler as polars_compiler +from bigframes.operations import base_ops + +if TYPE_CHECKING: + import polars as pl + + +@dataclasses.dataclass(frozen=True) +class NotNullOp(base_ops.UnaryOp): + name: ClassVar[str] = "notnull" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return dtypes.BOOL_DTYPE + + +notnull_op = NotNullOp() + + +def _ibis_notnull_op_impl(x: ibis_types.Value): + return x.notnull() + + +scalar_op_compiler.scalar_op_compiler.register_unary_op(notnull_op)( + _ibis_notnull_op_impl +) + + +if hasattr(polars_compiler, "PolarsExpressionCompiler"): + + def _polars_notnull_op_impl( + 
compiler: polars_compiler.PolarsExpressionCompiler, + op: NotNullOp, + input: pl.Expr, + ) -> pl.Expr: + return input.is_not_null() + + # TODO(https://github.com/python/mypy/issues/13040): remove `type: ignore` + # when mypy can better handle singledispatch. + polars_compiler.PolarsExpressionCompiler.compile_op.register( # type: ignore + NotNullOp, _polars_notnull_op_impl + ) + + +__all__ = [ + "NotNullOp", + "notnull_op", +] diff --git a/bigframes/operations/isnull_op.py b/bigframes/operations/isnull_op.py deleted file mode 100644 index 0579d7ac1c..0000000000 --- a/bigframes/operations/isnull_op.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -import typing - -# Direct imports from bigframes -from bigframes import dtypes -from bigframes.operations import base_ops -import bigframes.operations.type as op_typing - -# Imports for Ibis compilation -from bigframes_vendored.ibis.expr import types as ibis_types - -# Imports for Polars compilation -try: - import polars as pl -except ImportError: - # Polars is optional, error will be raised elsewhere if user tries to use it. 
- pass - - -# Definitions of IsNullOp and NotNullOp operations -IsNullOp = base_ops.create_unary_op( - name="isnull", - type_signature=op_typing.FixedOutputType( - lambda x: True, dtypes.BOOL_DTYPE, description="nullable" - ), -) -isnull_op = IsNullOp() - -NotNullOp = base_ops.create_unary_op( - name="notnull", - type_signature=op_typing.FixedOutputType( - lambda x: True, dtypes.BOOL_DTYPE, description="nullable" - ), -) -notnull_op = NotNullOp() - -# Ibis Scalar Op Implementations -def _ibis_isnull_op_impl(x: ibis_types.Value): - return x.isnull() - -def _ibis_notnull_op_impl(x: ibis_types.Value): - return x.notnull() - - -# Polars Expression Implementations -def _polars_isnull_op_impl(op: IsNullOp, input: pl.Expr) -> pl.Expr: - return input.is_null() - -def _polars_notnull_op_impl(op: NotNullOp, input: pl.Expr) -> pl.Expr: - return input.is_not_null() - -__all__ = [ - "IsNullOp", - "isnull_op", - "NotNullOp", - "notnull_op", - "_ibis_isnull_op_impl", - "_ibis_notnull_op_impl", - "_polars_isnull_op_impl", - "_polars_notnull_op_impl", -] From 9c177250e46bc75be2f34827cd1d26afa7994f84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 8 Jul 2025 10:06:19 -0700 Subject: [PATCH 03/18] bad merge --- bigframes/dataframe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 3443b773fa..1884f0beff 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2531,7 +2531,6 @@ def _filter_rows( elif items is not None: # Behavior matches pandas 2.1+, older pandas versions would reindex block = self._block - block = self._block block, mask_id = block.apply_unary_op( self._block.index_columns[0], ops.IsInOp(values=tuple(items)) ) From 78e458568d5fcf946e14a08ae5c1e47629fc0370 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 8 Jul 2025 10:09:41 -0700 Subject: [PATCH 04/18] fix local pytest --- tests/system/small/pandas/core/methods/__init__.py | 13 ------------- 
tests/system/small/pandas/io/__init__.py | 13 ------------- tests/system/small/pandas/io/api/__init__.py | 13 ------------- .../pandas/{core/methods => }/test_describe.py | 0 .../pandas/{io/api => }/test_read_gbq_colab.py | 0 5 files changed, 39 deletions(-) delete mode 100644 tests/system/small/pandas/core/methods/__init__.py delete mode 100644 tests/system/small/pandas/io/__init__.py delete mode 100644 tests/system/small/pandas/io/api/__init__.py rename tests/system/small/pandas/{core/methods => }/test_describe.py (100%) rename tests/system/small/pandas/{io/api => }/test_read_gbq_colab.py (100%) diff --git a/tests/system/small/pandas/core/methods/__init__.py b/tests/system/small/pandas/core/methods/__init__.py deleted file mode 100644 index 0a2669d7a2..0000000000 --- a/tests/system/small/pandas/core/methods/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/tests/system/small/pandas/io/__init__.py b/tests/system/small/pandas/io/__init__.py deleted file mode 100644 index 0a2669d7a2..0000000000 --- a/tests/system/small/pandas/io/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/tests/system/small/pandas/io/api/__init__.py b/tests/system/small/pandas/io/api/__init__.py deleted file mode 100644 index 0a2669d7a2..0000000000 --- a/tests/system/small/pandas/io/api/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
diff --git a/tests/system/small/pandas/core/methods/test_describe.py b/tests/system/small/pandas/test_describe.py similarity index 100% rename from tests/system/small/pandas/core/methods/test_describe.py rename to tests/system/small/pandas/test_describe.py diff --git a/tests/system/small/pandas/io/api/test_read_gbq_colab.py b/tests/system/small/pandas/test_read_gbq_colab.py similarity index 100% rename from tests/system/small/pandas/io/api/test_read_gbq_colab.py rename to tests/system/small/pandas/test_read_gbq_colab.py From 53e0a3ec36c957c09e1f5c421a5359aba772cc72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 8 Jul 2025 10:27:18 -0700 Subject: [PATCH 05/18] dont construct polars compiler if no polars --- bigframes/core/compile/polars/compiler.py | 636 +++++++++++----------- 1 file changed, 321 insertions(+), 315 deletions(-) diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index 9fb4131107..21ec8f8c71 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -363,334 +363,340 @@ def compile_agg_op( f"Aggregate op {op} not yet supported in polars engine." ) + @dataclasses.dataclass(frozen=True) + class PolarsCompiler: + """ + Compiles ArrayValue to polars LazyFrame and executes. + + This feature is in development and is incomplete. + While most node types are supported, this has the following limitations: + 1. GBQ data sources not supported. + 2. Joins do not order rows correctly + 3. Incomplete scalar op support + 4. Incomplete aggregate op support + 5. Incomplete analytic op support + 6. Some complex windowing types not supported (eg. groupby + rolling) + 7. UDFs are not supported. + 8. Returned types may not be entirely consistent with BigQuery backend + 9. Some operations are not entirely lazy - sampling and somse windowing. 
+ """ -@dataclasses.dataclass(frozen=True) -class PolarsCompiler: - """ - Compiles ArrayValue to polars LazyFrame and executes. - - This feature is in development and is incomplete. - While most node types are supported, this has the following limitations: - 1. GBQ data sources not supported. - 2. Joins do not order rows correctly - 3. Incomplete scalar op support - 4. Incomplete aggregate op support - 5. Incomplete analytic op support - 6. Some complex windowing types not supported (eg. groupby + rolling) - 7. UDFs are not supported. - 8. Returned types may not be entirely consistent with BigQuery backend - 9. Some operations are not entirely lazy - sampling and somse windowing. - """ - - expr_compiler = PolarsExpressionCompiler() - agg_compiler = PolarsAggregateCompiler() - - def compile(self, plan: nodes.BigFrameNode) -> pl.LazyFrame: - if not polars_installed: - raise ValueError( - "Polars is not installed, cannot compile to polars engine." - ) + expr_compiler = PolarsExpressionCompiler() + agg_compiler = PolarsAggregateCompiler() - # TODO: Create standard way to configure BFET -> BFET rewrites - # Polars has incomplete slice support in lazy mode - node = plan - node = bigframes.core.rewrite.column_pruning(node) - node = nodes.bottom_up(node, bigframes.core.rewrite.rewrite_slice) - node = bigframes.core.rewrite.pull_out_window_order(node) - node = bigframes.core.rewrite.schema_binding.bind_schema_to_tree(node) - node = lowering.lower_ops_to_polars(node) - return self.compile_node(node) - - @functools.singledispatchmethod - def compile_node(self, node: nodes.BigFrameNode) -> pl.LazyFrame: - """Defines transformation but isn't cached, always use compile_node instead""" - raise ValueError(f"Can't compile unrecognized node: {node}") - - @compile_node.register - def compile_readlocal(self, node: nodes.ReadLocalNode): - cols_to_read = { - scan_item.source_id: scan_item.id.sql for scan_item in node.scan_list.items - } - lazy_frame = cast( - pl.DataFrame, 
pl.from_arrow(node.local_data_source.data) - ).lazy() - lazy_frame = lazy_frame.select(cols_to_read.keys()).rename(cols_to_read) - if node.offsets_col: - lazy_frame = lazy_frame.with_columns( - [pl.int_range(pl.len(), dtype=pl.Int64).alias(node.offsets_col.sql)] - ) - return lazy_frame - - @compile_node.register - def compile_filter(self, node: nodes.FilterNode): - return self.compile_node(node.child).filter( - self.expr_compiler.compile_expression(node.predicate) - ) - - @compile_node.register - def compile_orderby(self, node: nodes.OrderByNode): - frame = self.compile_node(node.child) - if len(node.by) == 0: - # pragma: no cover - return frame - return self._sort(frame, node.by) - - def _sort( - self, frame: pl.LazyFrame, by: Sequence[ordering.OrderingExpression] - ) -> pl.LazyFrame: - sorted = frame.sort( - [self.expr_compiler.compile_expression(by.scalar_expression) for by in by], - descending=[not by.direction.is_ascending for by in by], - nulls_last=[by.na_last for by in by], - maintain_order=True, - ) - return sorted - - @compile_node.register - def compile_reversed(self, node: nodes.ReversedNode): - return self.compile_node(node.child).reverse() - - @compile_node.register - def compile_selection(self, node: nodes.SelectionNode): - return self.compile_node(node.child).select( - **{new.sql: orig.id.sql for orig, new in node.input_output_pairs} - ) - - @compile_node.register - def compile_projection(self, node: nodes.ProjectionNode): - new_cols = [] - for proj_expr, name in node.assignments: - bound_expr = ex.bind_schema_fields(proj_expr, node.child.field_by_id) - new_col = self.expr_compiler.compile_expression(bound_expr).alias(name.sql) - if bound_expr.output_type is None: - new_col = new_col.cast( - _bigframes_dtype_to_polars_dtype(bigframes.dtypes.DEFAULT_DTYPE) + def compile(self, plan: nodes.BigFrameNode) -> pl.LazyFrame: + if not polars_installed: + raise ValueError( + "Polars is not installed, cannot compile to polars engine." 
) - new_cols.append(new_col) - return self.compile_node(node.child).with_columns(new_cols) - - @compile_node.register - def compile_offsets(self, node: nodes.PromoteOffsetsNode): - return self.compile_node(node.child).with_columns( - [pl.int_range(pl.len(), dtype=pl.Int64).alias(node.col_id.sql)] - ) - - @compile_node.register - def compile_join(self, node: nodes.JoinNode): - left = self.compile_node(node.left_child) - right = self.compile_node(node.right_child) - left_on = [l_name.id.sql for l_name, _ in node.conditions] - right_on = [r_name.id.sql for _, r_name in node.conditions] - if node.type == "right": - return self._ordered_join( - right, left, "left", right_on, left_on, node.joins_nulls - ).select([id.sql for id in node.ids]) - return self._ordered_join( - left, right, node.type, left_on, right_on, node.joins_nulls - ) - - def _ordered_join( - self, - left_frame: pl.LazyFrame, - right_frame: pl.LazyFrame, - how: Literal["inner", "outer", "left", "cross"], - left_on: Sequence[str], - right_on: Sequence[str], - join_nulls: bool, - ): - if how == "right": - # seems to cause seg faults as of v1.30 for no apparent reason - raise ValueError("right join not supported") - left = left_frame.with_columns( - [ - pl.int_range(pl.len()).alias("_bf_join_l"), - ] - ) - right = right_frame.with_columns( - [ - pl.int_range(pl.len()).alias("_bf_join_r"), - ] - ) - if how != "cross": - joined = left.join( - right, - how=how, - left_on=left_on, - right_on=right_on, - # Note: join_nulls renamed to nulls_equal for polars 1.24 - join_nulls=join_nulls, # type: ignore - coalesce=False, + + # TODO: Create standard way to configure BFET -> BFET rewrites + # Polars has incomplete slice support in lazy mode + node = plan + node = bigframes.core.rewrite.column_pruning(node) + node = nodes.bottom_up(node, bigframes.core.rewrite.rewrite_slice) + node = bigframes.core.rewrite.pull_out_window_order(node) + node = bigframes.core.rewrite.schema_binding.bind_schema_to_tree(node) + node = 
lowering.lower_ops_to_polars(node) + return self.compile_node(node) + + @functools.singledispatchmethod + def compile_node(self, node: nodes.BigFrameNode) -> pl.LazyFrame: + """Defines transformation but isn't cached, always use compile_node instead""" + raise ValueError(f"Can't compile unrecognized node: {node}") + + @compile_node.register + def compile_readlocal(self, node: nodes.ReadLocalNode): + cols_to_read = { + scan_item.source_id: scan_item.id.sql + for scan_item in node.scan_list.items + } + lazy_frame = cast( + pl.DataFrame, pl.from_arrow(node.local_data_source.data) + ).lazy() + lazy_frame = lazy_frame.select(cols_to_read.keys()).rename(cols_to_read) + if node.offsets_col: + lazy_frame = lazy_frame.with_columns( + [pl.int_range(pl.len(), dtype=pl.Int64).alias(node.offsets_col.sql)] + ) + return lazy_frame + + @compile_node.register + def compile_filter(self, node: nodes.FilterNode): + return self.compile_node(node.child).filter( + self.expr_compiler.compile_expression(node.predicate) ) - else: - joined = left.join(right, how=how, coalesce=False) - - join_order = ( - ["_bf_join_l", "_bf_join_r"] - if how != "right" - else ["_bf_join_r", "_bf_join_l"] - ) - return joined.sort(join_order, nulls_last=True).drop( - ["_bf_join_l", "_bf_join_r"] - ) - - @compile_node.register - def compile_concat(self, node: nodes.ConcatNode): - child_frames = [self.compile_node(child) for child in node.child_nodes] - child_frames = [ - frame.rename( - {col: id.sql for col, id in zip(frame.columns, node.output_ids)} + + @compile_node.register + def compile_orderby(self, node: nodes.OrderByNode): + frame = self.compile_node(node.child) + if len(node.by) == 0: + # pragma: no cover + return frame + return self._sort(frame, node.by) + + def _sort( + self, frame: pl.LazyFrame, by: Sequence[ordering.OrderingExpression] + ) -> pl.LazyFrame: + sorted = frame.sort( + [ + self.expr_compiler.compile_expression(by.scalar_expression) + for by in by + ], + descending=[not 
by.direction.is_ascending for by in by], + nulls_last=[by.na_last for by in by], + maintain_order=True, ) - for frame in child_frames - ] - df = pl.concat(child_frames) - return df - - @compile_node.register - def compile_agg(self, node: nodes.AggregateNode): - df = self.compile_node(node.child) - if node.dropna and len(node.by_column_ids) > 0: - df = df.filter( - [pl.col(ref.id.sql).is_not_null() for ref in node.by_column_ids] + return sorted + + @compile_node.register + def compile_reversed(self, node: nodes.ReversedNode): + return self.compile_node(node.child).reverse() + + @compile_node.register + def compile_selection(self, node: nodes.SelectionNode): + return self.compile_node(node.child).select( + **{new.sql: orig.id.sql for orig, new in node.input_output_pairs} ) - if node.order_by: - df = self._sort(df, node.order_by) - return self._aggregate(df, node.aggregations, node.by_column_ids) - - def _aggregate( - self, - df: pl.LazyFrame, - aggregations: Sequence[Tuple[ex.Aggregation, identifiers.ColumnId]], - grouping_keys: Tuple[ex.DerefOp, ...], - ) -> pl.LazyFrame: - # Need to materialize columns to broadcast constants - agg_inputs = [ - list( - map( - lambda x: x.alias(guid.generate_guid()), - self.agg_compiler.get_args(agg), + + @compile_node.register + def compile_projection(self, node: nodes.ProjectionNode): + new_cols = [] + for proj_expr, name in node.assignments: + bound_expr = ex.bind_schema_fields(proj_expr, node.child.field_by_id) + new_col = self.expr_compiler.compile_expression(bound_expr).alias( + name.sql ) + if bound_expr.output_type is None: + new_col = new_col.cast( + _bigframes_dtype_to_polars_dtype(bigframes.dtypes.DEFAULT_DTYPE) + ) + new_cols.append(new_col) + return self.compile_node(node.child).with_columns(new_cols) + + @compile_node.register + def compile_offsets(self, node: nodes.PromoteOffsetsNode): + return self.compile_node(node.child).with_columns( + [pl.int_range(pl.len(), dtype=pl.Int64).alias(node.col_id.sql)] ) - for agg, _ 
in aggregations - ] - - df_agg_inputs = df.with_columns(itertools.chain(*agg_inputs)) - - agg_exprs = [ - self.agg_compiler.compile_agg_op( - agg.op, list(map(lambda x: x.meta.output_name(), inputs)) - ).alias(id.sql) - for (agg, id), inputs in zip(aggregations, agg_inputs) - ] - - if len(grouping_keys) > 0: - group_exprs = [pl.col(ref.id.sql) for ref in grouping_keys] - grouped_df = df_agg_inputs.group_by(group_exprs) - return grouped_df.agg(agg_exprs).sort(group_exprs, nulls_last=True) - else: - return df_agg_inputs.select(agg_exprs) - - @compile_node.register - def compile_explode(self, node: nodes.ExplodeNode): - assert node.offsets_col is None - df = self.compile_node(node.child) - cols = [pl.col(col.id.sql) for col in node.column_ids] - return df.explode(cols) - - @compile_node.register - def compile_sample(self, node: nodes.RandomSampleNode): - df = self.compile_node(node.child) - # Sample is not available on lazyframe - return df.collect().sample(fraction=node.fraction).lazy() - - @compile_node.register - def compile_window(self, node: nodes.WindowOpNode): - df = self.compile_node(node.child) - - window = node.window_spec - # Should have been handled by reweriter - assert len(window.ordering) == 0 - if window.min_periods > 0: - raise NotImplementedError("min_period not yet supported for polars engine") - - if (window.bounds is None) or (window.is_unbounded): - # polars will automatically broadcast the aggregate to the matching input rows - agg_pl = self.agg_compiler.compile_agg_expr(node.expression) - if window.grouping_keys: - agg_pl = agg_pl.over(id.id.sql for id in window.grouping_keys) - result = df.with_columns(agg_pl.alias(node.output_name.sql)) - else: # row-bounded window - window_result = self._calc_row_analytic_func( - df, node.expression, node.window_spec, node.output_name.sql + + @compile_node.register + def compile_join(self, node: nodes.JoinNode): + left = self.compile_node(node.left_child) + right = self.compile_node(node.right_child) + 
left_on = [l_name.id.sql for l_name, _ in node.conditions] + right_on = [r_name.id.sql for _, r_name in node.conditions] + if node.type == "right": + return self._ordered_join( + right, left, "left", right_on, left_on, node.joins_nulls + ).select([id.sql for id in node.ids]) + return self._ordered_join( + left, right, node.type, left_on, right_on, node.joins_nulls ) - result = pl.concat([df, window_result], how="horizontal") - # Probably easier just to pull this out as a rewriter - if ( - node.expression.op.skips_nulls - and not node.never_skip_nulls - and node.expression.column_references + def _ordered_join( + self, + left_frame: pl.LazyFrame, + right_frame: pl.LazyFrame, + how: Literal["inner", "outer", "left", "cross"], + left_on: Sequence[str], + right_on: Sequence[str], + join_nulls: bool, ): - nullity_expr = functools.reduce( - operator.or_, - ( - pl.col(column.sql).is_null() - for column in node.expression.column_references - ), + if how == "right": + # seems to cause seg faults as of v1.30 for no apparent reason + raise ValueError("right join not supported") + left = left_frame.with_columns( + [ + pl.int_range(pl.len()).alias("_bf_join_l"), + ] + ) + right = right_frame.with_columns( + [ + pl.int_range(pl.len()).alias("_bf_join_r"), + ] ) - result = result.with_columns( - pl.when(nullity_expr) - .then(None) - .otherwise(pl.col(node.output_name.sql)) - .alias(node.output_name.sql) + if how != "cross": + joined = left.join( + right, + how=how, + left_on=left_on, + right_on=right_on, + # Note: join_nulls renamed to nulls_equal for polars 1.24 + join_nulls=join_nulls, # type: ignore + coalesce=False, + ) + else: + joined = left.join(right, how=how, coalesce=False) + + join_order = ( + ["_bf_join_l", "_bf_join_r"] + if how != "right" + else ["_bf_join_r", "_bf_join_l"] ) - return result - - def _calc_row_analytic_func( - self, - frame: pl.LazyFrame, - agg_expr: ex.Aggregation, - window: window_spec.WindowSpec, - name: str, - ) -> pl.LazyFrame: - if not 
isinstance(window.bounds, window_spec.RowsWindowBounds): - raise NotImplementedError("Only row bounds supported by polars engine") - groupby = None - if len(window.grouping_keys) > 0: - groupby = [ - self.expr_compiler.compile_expression(ref) - for ref in window.grouping_keys + return joined.sort(join_order, nulls_last=True).drop( + ["_bf_join_l", "_bf_join_r"] + ) + + @compile_node.register + def compile_concat(self, node: nodes.ConcatNode): + child_frames = [self.compile_node(child) for child in node.child_nodes] + child_frames = [ + frame.rename( + {col: id.sql for col, id in zip(frame.columns, node.output_ids)} + ) + for frame in child_frames ] + df = pl.concat(child_frames) + return df + + @compile_node.register + def compile_agg(self, node: nodes.AggregateNode): + df = self.compile_node(node.child) + if node.dropna and len(node.by_column_ids) > 0: + df = df.filter( + [pl.col(ref.id.sql).is_not_null() for ref in node.by_column_ids] + ) + if node.order_by: + df = self._sort(df, node.order_by) + return self._aggregate(df, node.aggregations, node.by_column_ids) - # Polars API semi-bounded, and any grouped rolling window challenging - # https://github.com/pola-rs/polars/issues/4799 - # https://github.com/pola-rs/polars/issues/8976 - pl_agg_expr = self.agg_compiler.compile_agg_expr(agg_expr).alias(name) - index_col_name = "_bf_pl_engine_offsets" - indexed_df = frame.with_row_index(index_col_name) - # https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.rolling.html - period_n, offset_n = _get_period_and_offset(window.bounds) - return ( - indexed_df.rolling( - index_column=index_col_name, - period=f"{period_n}i", - offset=f"{offset_n}i" if (offset_n is not None) else None, - group_by=groupby, + def _aggregate( + self, + df: pl.LazyFrame, + aggregations: Sequence[Tuple[ex.Aggregation, identifiers.ColumnId]], + grouping_keys: Tuple[ex.DerefOp, ...], + ) -> pl.LazyFrame: + # Need to materialize columns to broadcast constants + agg_inputs = [ 
+ list( + map( + lambda x: x.alias(guid.generate_guid()), + self.agg_compiler.get_args(agg), + ) + ) + for agg, _ in aggregations + ] + + df_agg_inputs = df.with_columns(itertools.chain(*agg_inputs)) + + agg_exprs = [ + self.agg_compiler.compile_agg_op( + agg.op, list(map(lambda x: x.meta.output_name(), inputs)) + ).alias(id.sql) + for (agg, id), inputs in zip(aggregations, agg_inputs) + ] + + if len(grouping_keys) > 0: + group_exprs = [pl.col(ref.id.sql) for ref in grouping_keys] + grouped_df = df_agg_inputs.group_by(group_exprs) + return grouped_df.agg(agg_exprs).sort(group_exprs, nulls_last=True) + else: + return df_agg_inputs.select(agg_exprs) + + @compile_node.register + def compile_explode(self, node: nodes.ExplodeNode): + assert node.offsets_col is None + df = self.compile_node(node.child) + cols = [pl.col(col.id.sql) for col in node.column_ids] + return df.explode(cols) + + @compile_node.register + def compile_sample(self, node: nodes.RandomSampleNode): + df = self.compile_node(node.child) + # Sample is not available on lazyframe + return df.collect().sample(fraction=node.fraction).lazy() + + @compile_node.register + def compile_window(self, node: nodes.WindowOpNode): + df = self.compile_node(node.child) + + window = node.window_spec + # Should have been handled by reweriter + assert len(window.ordering) == 0 + if window.min_periods > 0: + raise NotImplementedError( + "min_period not yet supported for polars engine" + ) + + if (window.bounds is None) or (window.is_unbounded): + # polars will automatically broadcast the aggregate to the matching input rows + agg_pl = self.agg_compiler.compile_agg_expr(node.expression) + if window.grouping_keys: + agg_pl = agg_pl.over(id.id.sql for id in window.grouping_keys) + result = df.with_columns(agg_pl.alias(node.output_name.sql)) + else: # row-bounded window + window_result = self._calc_row_analytic_func( + df, node.expression, node.window_spec, node.output_name.sql + ) + result = pl.concat([df, window_result], 
how="horizontal") + + # Probably easier just to pull this out as a rewriter + if ( + node.expression.op.skips_nulls + and not node.never_skip_nulls + and node.expression.column_references + ): + nullity_expr = functools.reduce( + operator.or_, + ( + pl.col(column.sql).is_null() + for column in node.expression.column_references + ), + ) + result = result.with_columns( + pl.when(nullity_expr) + .then(None) + .otherwise(pl.col(node.output_name.sql)) + .alias(node.output_name.sql) + ) + return result + + def _calc_row_analytic_func( + self, + frame: pl.LazyFrame, + agg_expr: ex.Aggregation, + window: window_spec.WindowSpec, + name: str, + ) -> pl.LazyFrame: + if not isinstance(window.bounds, window_spec.RowsWindowBounds): + raise NotImplementedError("Only row bounds supported by polars engine") + groupby = None + if len(window.grouping_keys) > 0: + groupby = [ + self.expr_compiler.compile_expression(ref) + for ref in window.grouping_keys + ] + + # Polars API semi-bounded, and any grouped rolling window challenging + # https://github.com/pola-rs/polars/issues/4799 + # https://github.com/pola-rs/polars/issues/8976 + pl_agg_expr = self.agg_compiler.compile_agg_expr(agg_expr).alias(name) + index_col_name = "_bf_pl_engine_offsets" + indexed_df = frame.with_row_index(index_col_name) + # https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.rolling.html + period_n, offset_n = _get_period_and_offset(window.bounds) + return ( + indexed_df.rolling( + index_column=index_col_name, + period=f"{period_n}i", + offset=f"{offset_n}i" if (offset_n is not None) else None, + group_by=groupby, + ) + .agg(pl_agg_expr) + .select(name) ) - .agg(pl_agg_expr) - .select(name) - ) - - -def _get_period_and_offset( - bounds: window_spec.RowsWindowBounds, -) -> tuple[int, Optional[int]]: - # fixed size window - if (bounds.start is not None) and (bounds.end is not None): - return ((bounds.end - bounds.start + 1), bounds.start - 1) - - LARGE_N = 1000000000 - if bounds.start 
is not None: - return (LARGE_N, bounds.start - 1) - if bounds.end is not None: - return (LARGE_N, None) - raise ValueError("Not a bounded window") + + def _get_period_and_offset( + bounds: window_spec.RowsWindowBounds, + ) -> tuple[int, Optional[int]]: + # fixed size window + if (bounds.start is not None) and (bounds.end is not None): + return ((bounds.end - bounds.start + 1), bounds.start - 1) + + LARGE_N = 1000000000 + if bounds.start is not None: + return (LARGE_N, bounds.start - 1) + if bounds.end is not None: + return (LARGE_N, None) + raise ValueError("Not a bounded window") From 9cd1fdeeb79b9c888097550c38b1203abfd91947 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Mon, 14 Jul 2025 17:08:29 -0500 Subject: [PATCH 06/18] limit scope to just splitting large files --- bigframes/core/compile/__init__.py | 2 +- bigframes/core/compile/aggregate_compiler.py | 2 +- bigframes/core/compile/api.py | 6 +-- bigframes/core/compile/compiled.py | 6 +-- .../core/compile/ibis_compiler/__init__.py | 25 +++++++++ .../ibis_compiler.py} | 2 +- .../ibis_compiler/operations/__init__.py | 21 ++++++++ .../operations/generic_ops/__init__.py | 13 +++++ .../operations/generic_ops/isnull_op.py | 25 +++++++++ .../operations/generic_ops/notnull_op.py | 52 +++++++++++++++++++ .../{ => ibis_compiler}/scalar_op_compiler.py | 0 .../{ => ibis_compiler}/scalar_op_registry.py | 4 +- bigframes/core/compile/polars/__init__.py | 13 ++++- bigframes/core/compile/polars/compiler.py | 8 --- .../compile/polars/operations/__init__.py | 21 ++++++++ .../polars/operations/generic_ops/__init__.py | 13 +++++ .../operations/generic_ops/isnull_op.py | 39 ++++++++++++++ .../operations/generic_ops/notnull_op.py | 39 ++++++++++++++ bigframes/operations/generic_ops/__init__.py | 11 ---- bigframes/operations/generic_ops/isnull_op.py | 14 ----- .../operations/generic_ops/notnull_op.py | 14 ----- 21 files changed, 270 insertions(+), 60 deletions(-) create mode 100644 
bigframes/core/compile/ibis_compiler/__init__.py rename bigframes/core/compile/{compiler.py => ibis_compiler/ibis_compiler.py} (99%) create mode 100644 bigframes/core/compile/ibis_compiler/operations/__init__.py create mode 100644 bigframes/core/compile/ibis_compiler/operations/generic_ops/__init__.py create mode 100644 bigframes/core/compile/ibis_compiler/operations/generic_ops/isnull_op.py create mode 100644 bigframes/core/compile/ibis_compiler/operations/generic_ops/notnull_op.py rename bigframes/core/compile/{ => ibis_compiler}/scalar_op_compiler.py (100%) rename bigframes/core/compile/{ => ibis_compiler}/scalar_op_registry.py (99%) create mode 100644 bigframes/core/compile/polars/operations/__init__.py create mode 100644 bigframes/core/compile/polars/operations/generic_ops/__init__.py create mode 100644 bigframes/core/compile/polars/operations/generic_ops/isnull_op.py create mode 100644 bigframes/core/compile/polars/operations/generic_ops/notnull_op.py diff --git a/bigframes/core/compile/__init__.py b/bigframes/core/compile/__init__.py index e2487306ab..68c36df288 100644 --- a/bigframes/core/compile/__init__.py +++ b/bigframes/core/compile/__init__.py @@ -14,8 +14,8 @@ from __future__ import annotations from bigframes.core.compile.api import test_only_ibis_inferred_schema -from bigframes.core.compile.compiler import compile_sql from bigframes.core.compile.configs import CompileRequest, CompileResult +from bigframes.core.compile.ibis_compiler.ibis_compiler import compile_sql __all__ = [ "test_only_ibis_inferred_schema", diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 0d31798f25..4e0bf477fc 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -27,8 +27,8 @@ import pandas as pd from bigframes.core.compile import constants as compiler_constants +import bigframes.core.compile.ibis_compiler.scalar_op_compiler as scalar_compilers import 
bigframes.core.compile.ibis_types as compile_ibis_types -import bigframes.core.compile.scalar_op_compiler as scalar_compilers import bigframes.core.expression as ex import bigframes.core.window_spec as window_spec import bigframes.operations.aggregations as agg_ops diff --git a/bigframes/core/compile/api.py b/bigframes/core/compile/api.py index ddd8622327..3a4695c50d 100644 --- a/bigframes/core/compile/api.py +++ b/bigframes/core/compile/api.py @@ -16,7 +16,7 @@ from typing import TYPE_CHECKING from bigframes.core import rewrite -from bigframes.core.compile import compiler +from bigframes.core.compile.ibis_compiler import ibis_compiler if TYPE_CHECKING: import bigframes.core.nodes @@ -26,9 +26,9 @@ def test_only_ibis_inferred_schema(node: bigframes.core.nodes.BigFrameNode): """Use only for testing paths to ensure ibis inferred schema does not diverge from bigframes inferred schema.""" import bigframes.core.schema - node = compiler._replace_unsupported_ops(node) + node = ibis_compiler._replace_unsupported_ops(node) node = rewrite.bake_order(node) - ir = compiler.compile_node(node) + ir = ibis_compiler.compile_node(node) items = tuple( bigframes.core.schema.SchemaItem(name, ir.get_column_type(ibis_id)) for name, ibis_id in zip(node.schema.names, ir.column_ids) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 3245d68fdf..5e3f5f0e77 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -32,8 +32,8 @@ from bigframes.core import utils import bigframes.core.compile.aggregate_compiler as agg_compiler import bigframes.core.compile.googlesql +import bigframes.core.compile.ibis_compiler.scalar_op_compiler as op_compilers import bigframes.core.compile.ibis_types -import bigframes.core.compile.scalar_op_compiler as op_compilers import bigframes.core.expression as ex from bigframes.core.ordering import OrderingExpression import bigframes.core.sql @@ -47,7 +47,7 @@ # This must be the last import. 
Currently depending on side-effects. # TODO(tswast): Refactor all ops to register in the same file as where they are # defined so we don't need this. -import bigframes.core.compile.scalar_op_registry # noqa: F401,E402 +import bigframes.core.compile.ibis_compiler.scalar_op_registry # noqa: F401,E402 # Ibis Implementations @@ -684,7 +684,7 @@ def _join_condition( def _as_groupable(value: ibis_types.Value): - from bigframes.core.compile import scalar_op_registry + from bigframes.core.compile.ibis_compiler import scalar_op_registry # Some types need to be converted to another type to enable groupby if value.type().is_float64(): diff --git a/bigframes/core/compile/ibis_compiler/__init__.py b/bigframes/core/compile/ibis_compiler/__init__.py new file mode 100644 index 0000000000..125e161fef --- /dev/null +++ b/bigframes/core/compile/ibis_compiler/__init__.py @@ -0,0 +1,25 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Compiler for BigFrames expression to Ibis expression. + +Make sure to import all ibis_compiler implementations here so that they get +registered. 
+""" + +from __future__ import annotations + +import bigframes.core.compile.ibis_compiler.operations.generic_ops.isnull_op # noqa: F401 +import bigframes.core.compile.ibis_compiler.operations.generic_ops.notnull_op # noqa: F401 +import bigframes.core.compile.ibis_compiler.scalar_op_registry # noqa: F401 diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/ibis_compiler/ibis_compiler.py similarity index 99% rename from bigframes/core/compile/compiler.py rename to bigframes/core/compile/ibis_compiler/ibis_compiler.py index 3f5e7ba533..ff0441ea22 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/ibis_compiler/ibis_compiler.py @@ -177,7 +177,7 @@ def compile_readlocal(node: nodes.ReadLocalNode, *args): @_compile_node.register def compile_readtable(node: nodes.ReadTableNode, *args): - from bigframes.core.compile import scalar_op_registry + from bigframes.core.compile.ibis_compiler import scalar_op_registry ibis_table = _table_to_ibis( node.source, scan_cols=[col.source_id for col in node.scan_list.items] diff --git a/bigframes/core/compile/ibis_compiler/operations/__init__.py b/bigframes/core/compile/ibis_compiler/operations/__init__.py new file mode 100644 index 0000000000..cec484f908 --- /dev/null +++ b/bigframes/core/compile/ibis_compiler/operations/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Operation implementations for the Ibis-based compiler. 
+ +This directory structure should reflect the same layout as the +`bigframes/operations` directory where the operations are defined. + +Prefer one file per op to keep file sizes manageable for text editors and LLMs. +""" diff --git a/bigframes/core/compile/ibis_compiler/operations/generic_ops/__init__.py b/bigframes/core/compile/ibis_compiler/operations/generic_ops/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/bigframes/core/compile/ibis_compiler/operations/generic_ops/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/bigframes/core/compile/ibis_compiler/operations/generic_ops/isnull_op.py b/bigframes/core/compile/ibis_compiler/operations/generic_ops/isnull_op.py new file mode 100644 index 0000000000..75f8c37ab7 --- /dev/null +++ b/bigframes/core/compile/ibis_compiler/operations/generic_ops/isnull_op.py @@ -0,0 +1,25 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from bigframes_vendored.ibis.expr import types as ibis_types + +from bigframes.core.compile.ibis_compiler import scalar_op_compiler +from bigframes.operations.generic_ops import isnull_op + + +@scalar_op_compiler.scalar_op_compiler.register_unary_op(isnull_op.isnull_op) +def _ibis_isnull_op_impl(x: ibis_types.Value): + return x.isnull() diff --git a/bigframes/core/compile/ibis_compiler/operations/generic_ops/notnull_op.py b/bigframes/core/compile/ibis_compiler/operations/generic_ops/notnull_op.py new file mode 100644 index 0000000000..d773176593 --- /dev/null +++ b/bigframes/core/compile/ibis_compiler/operations/generic_ops/notnull_op.py @@ -0,0 +1,52 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import dataclasses +from typing import ClassVar + +# Imports for Ibis compilation +from bigframes_vendored.ibis.expr import types as ibis_types + +# Direct imports from bigframes +from bigframes import dtypes +from bigframes.core.compile.ibis_compiler import scalar_op_compiler +from bigframes.operations import base_ops + + +@dataclasses.dataclass(frozen=True) +class NotNullOp(base_ops.UnaryOp): + name: ClassVar[str] = "notnull" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return dtypes.BOOL_DTYPE + + +notnull_op = NotNullOp() + + +def _ibis_notnull_op_impl(x: ibis_types.Value): + return x.notnull() + + +scalar_op_compiler.scalar_op_compiler.register_unary_op(notnull_op)( + _ibis_notnull_op_impl +) + + +__all__ = [ + "NotNullOp", + "notnull_op", +] diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/ibis_compiler/scalar_op_compiler.py similarity index 100% rename from bigframes/core/compile/scalar_op_compiler.py rename to bigframes/core/compile/ibis_compiler/scalar_op_compiler.py diff --git a/bigframes/core/compile/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py similarity index 99% rename from bigframes/core/compile/scalar_op_registry.py rename to bigframes/core/compile/ibis_compiler/scalar_op_registry.py index ed8c1103d5..c771e2e3af 100644 --- a/bigframes/core/compile/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -27,10 +27,10 @@ from bigframes.core.compile.constants import UNIT_TO_US_CONVERSION_FACTORS import bigframes.core.compile.default_ordering -import bigframes.core.compile.ibis_types -from bigframes.core.compile.scalar_op_compiler import ( +from bigframes.core.compile.ibis_compiler.scalar_op_compiler import ( scalar_op_compiler, # TODO(tswast): avoid import of variables ) +import bigframes.core.compile.ibis_types import bigframes.operations as ops _ZERO = 
typing.cast(ibis_types.NumericValue, ibis_types.literal(0)) diff --git a/bigframes/core/compile/polars/__init__.py b/bigframes/core/compile/polars/__init__.py index 8c37e046ab..77ea0cd67a 100644 --- a/bigframes/core/compile/polars/__init__.py +++ b/bigframes/core/compile/polars/__init__.py @@ -11,6 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +"""Compiler for BigFrames expression to Polars LazyFrame expression. + +Make sure to import all polars implementations here so that they get registered. +""" from __future__ import annotations import warnings @@ -19,8 +24,12 @@ import polars # noqa from bigframes.core.compile.polars.compiler import PolarsCompiler + import bigframes.core.compile.polars.operations.generic_ops.isnull_op # noqa: F401 + import bigframes.core.compile.polars.operations.generic_ops.notnull_op # noqa: F401 __all__ = ["PolarsCompiler"] -except Exception: - msg = "Polars compiler not available as polars is not installed." +except Exception as exc: + msg = ( + f"Polars compiler not available as polars is not installed. 
Details: {str(exc)}" + ) warnings.warn(msg) diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index c31c122078..fd0462452b 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -233,14 +233,6 @@ def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: else: return input.is_in(op.values) or input.is_null() - @compile_op.register(gen_ops.IsNullOp) - def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: - return input.is_null() - - @compile_op.register(gen_ops.NotNullOp) - def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: - return input.is_not_null() - @compile_op.register(gen_ops.FillNaOp) @compile_op.register(gen_ops.CoalesceOp) def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: diff --git a/bigframes/core/compile/polars/operations/__init__.py b/bigframes/core/compile/polars/operations/__init__.py new file mode 100644 index 0000000000..83d08723fc --- /dev/null +++ b/bigframes/core/compile/polars/operations/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Operation implementations for the Polars LazyFrame compiler. + +This directory structure should reflect the same layout as the +`bigframes/operations` directory where the operations are defined. + +Prefer one file per op to keep file sizes manageable for text editors and LLMs. 
+""" diff --git a/bigframes/core/compile/polars/operations/generic_ops/__init__.py b/bigframes/core/compile/polars/operations/generic_ops/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/bigframes/core/compile/polars/operations/generic_ops/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/bigframes/core/compile/polars/operations/generic_ops/isnull_op.py b/bigframes/core/compile/polars/operations/generic_ops/isnull_op.py new file mode 100644 index 0000000000..e2b2652f75 --- /dev/null +++ b/bigframes/core/compile/polars/operations/generic_ops/isnull_op.py @@ -0,0 +1,39 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import bigframes.core.compile.polars.compiler as polars_compiler +from bigframes.operations.generic_ops import isnull_op + +if TYPE_CHECKING: + import polars as pl + + +def _polars_isnull_op_impl( + compiler: polars_compiler.PolarsExpressionCompiler, + op: isnull_op.IsNullOp, + input: pl.Expr, +) -> pl.Expr: + return input.is_null() + + +if hasattr(polars_compiler, "PolarsExpressionCompiler"): + # TODO(https://github.com/python/mypy/issues/13040): remove `type: ignore` + # when mypy can better handle singledispatch. + polars_compiler.PolarsExpressionCompiler.compile_op.register( # type: ignore + isnull_op.IsNullOp, _polars_isnull_op_impl + ) diff --git a/bigframes/core/compile/polars/operations/generic_ops/notnull_op.py b/bigframes/core/compile/polars/operations/generic_ops/notnull_op.py new file mode 100644 index 0000000000..e92bc750e3 --- /dev/null +++ b/bigframes/core/compile/polars/operations/generic_ops/notnull_op.py @@ -0,0 +1,39 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import bigframes.core.compile.polars.compiler as polars_compiler +from bigframes.operations.generic_ops import notnull_op + +if TYPE_CHECKING: + import polars as pl + + +def _polars_notnull_op_impl( + compiler: polars_compiler.PolarsExpressionCompiler, + op: notnull_op.NotNullOp, + input: pl.Expr, +) -> pl.Expr: + return input.is_not_null() + + +if hasattr(polars_compiler, "PolarsExpressionCompiler"): + # TODO(https://github.com/python/mypy/issues/13040): remove `type: ignore` + # when mypy can better handle singledispatch. + polars_compiler.PolarsExpressionCompiler.compile_op.register( # type: ignore + notnull_op.NotNullOp, _polars_notnull_op_impl + ) diff --git a/bigframes/operations/generic_ops/__init__.py b/bigframes/operations/generic_ops/__init__.py index a0f6eff272..a8e876da0d 100644 --- a/bigframes/operations/generic_ops/__init__.py +++ b/bigframes/operations/generic_ops/__init__.py @@ -18,8 +18,6 @@ from bigframes import dtypes from bigframes.operations import base_ops -from bigframes.operations.generic_ops.isnull_op import isnull_op, IsNullOp -from bigframes.operations.generic_ops.notnull_op import notnull_op, NotNullOp import bigframes.operations.type as op_typing InvertOp = base_ops.create_unary_op( @@ -155,12 +153,3 @@ class SqlScalarOp(base_ops.NaryOp): def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: return self._output_type - - -__all__ = [ - "InvertOp", - "IsNullOp", - "isnull_op", - "NotNullOp", - "notnull_op", -] diff --git a/bigframes/operations/generic_ops/isnull_op.py b/bigframes/operations/generic_ops/isnull_op.py index a7548767fb..29c7e82f95 100644 --- a/bigframes/operations/generic_ops/isnull_op.py +++ b/bigframes/operations/generic_ops/isnull_op.py @@ -17,12 +17,7 @@ import dataclasses from typing import ClassVar -# Imports for Ibis compilation -from bigframes_vendored.ibis.expr import types as ibis_types - -# Direct imports 
from bigframes from bigframes import dtypes -from bigframes.core.compile import scalar_op_compiler from bigframes.operations import base_ops @@ -35,15 +30,6 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT isnull_op = IsNullOp() - - -def _ibis_isnull_op_impl(x: ibis_types.Value): - return x.isnull() - - -scalar_op_compiler.scalar_op_compiler.register_unary_op(isnull_op)(_ibis_isnull_op_impl) - - __all__ = [ "IsNullOp", "isnull_op", diff --git a/bigframes/operations/generic_ops/notnull_op.py b/bigframes/operations/generic_ops/notnull_op.py index e516965c89..d8a034a1b4 100644 --- a/bigframes/operations/generic_ops/notnull_op.py +++ b/bigframes/operations/generic_ops/notnull_op.py @@ -17,12 +17,7 @@ import dataclasses from typing import ClassVar -# Imports for Ibis compilation -from bigframes_vendored.ibis.expr import types as ibis_types - -# Direct imports from bigframes from bigframes import dtypes -from bigframes.core.compile import scalar_op_compiler from bigframes.operations import base_ops @@ -37,15 +32,6 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT notnull_op = NotNullOp() -def _ibis_notnull_op_impl(x: ibis_types.Value): - return x.notnull() - - -scalar_op_compiler.scalar_op_compiler.register_unary_op(notnull_op)( - _ibis_notnull_op_impl -) - - __all__ = [ "NotNullOp", "notnull_op", From ec47c370b7e6976f745aab9be46048b9e000e218 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 15 Jul 2025 10:27:13 -0500 Subject: [PATCH 07/18] Update bigframes/core/compile/compiled.py --- bigframes/core/compile/compiled.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 5e3f5f0e77..af64e6a1bf 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -44,12 +44,6 @@ op_compiler = op_compilers.scalar_op_compiler -# This must be the last import. 
Currently depending on side-effects. -# TODO(tswast): Refactor all ops to register in the same file as where they are -# defined so we don't need this. -import bigframes.core.compile.ibis_compiler.scalar_op_registry # noqa: F401,E402 - - # Ibis Implementations class UnorderedIR: def __init__( From b1cf81cc8dc6d45b049b7863a997c4055ad982b7 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Tue, 5 Aug 2025 15:03:00 +0000 Subject: [PATCH 08/18] revert unneeded circular import workaround --- bigframes/core/array_value.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py index 968add8784..b47637cb59 100644 --- a/bigframes/core/array_value.py +++ b/bigframes/core/array_value.py @@ -37,10 +37,10 @@ from bigframes.core.window_spec import WindowSpec import bigframes.dtypes import bigframes.exceptions as bfe +import bigframes.operations as ops +import bigframes.operations.aggregations as agg_ops if typing.TYPE_CHECKING: - # Avoid circular imports. - import bigframes.operations.aggregations as agg_ops from bigframes.session import Session ORDER_ID_COLUMN = "bigframes_ordering_id" @@ -185,8 +185,6 @@ def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: def row_count(self) -> ArrayValue: """Get number of rows in ArrayValue as a single-entry ArrayValue.""" - import bigframes.operations.aggregations as agg_ops # Avoid circular imports. - return ArrayValue( nodes.AggregateNode( child=self.node, @@ -202,8 +200,6 @@ def row_count(self) -> ArrayValue: # Operations def filter_by_id(self, predicate_id: str, keep_null: bool = False) -> ArrayValue: """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" - import bigframes.operations as ops # Avoid circular imports. 
- predicate: ex.Expression = ex.deref(predicate_id) if keep_null: predicate = ops.fillna_op.as_expr(predicate, ex.const(True)) From 87fd788a064130f6b27e04ece90712fc005b1434 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Wed, 6 Aug 2025 16:48:33 +0000 Subject: [PATCH 09/18] combine null ops into generic_ops files --- bigframes/core/compile/compiled.py | 2 +- .../core/compile/ibis_compiler/__init__.py | 3 +- .../{ => ibis_compiler}/aggregate_compiler.py | 0 .../isnull_op.py => generic_ops.py} | 19 +++++-- .../operations/generic_ops/__init__.py | 13 ----- .../operations/generic_ops/notnull_op.py | 52 ------------------- bigframes/core/compile/polars/__init__.py | 7 ++- bigframes/core/compile/polars/compiler.py | 23 +++++++- .../notnull_op.py => generic_ops.py} | 26 ++++++---- .../polars/operations/generic_ops/__init__.py | 13 ----- .../operations/generic_ops/isnull_op.py | 39 -------------- bigframes/operations/__init__.py | 4 +- .../__init__.py => generic_ops.py} | 16 ++++++ bigframes/operations/generic_ops/isnull_op.py | 36 ------------- .../operations/generic_ops/notnull_op.py | 38 -------------- 15 files changed, 80 insertions(+), 211 deletions(-) rename bigframes/core/compile/{ => ibis_compiler}/aggregate_compiler.py (100%) rename bigframes/core/compile/ibis_compiler/operations/{generic_ops/isnull_op.py => generic_ops.py} (61%) delete mode 100644 bigframes/core/compile/ibis_compiler/operations/generic_ops/__init__.py delete mode 100644 bigframes/core/compile/ibis_compiler/operations/generic_ops/notnull_op.py rename bigframes/core/compile/polars/operations/{generic_ops/notnull_op.py => generic_ops.py} (61%) delete mode 100644 bigframes/core/compile/polars/operations/generic_ops/__init__.py delete mode 100644 bigframes/core/compile/polars/operations/generic_ops/isnull_op.py rename bigframes/operations/{generic_ops/__init__.py => generic_ops.py} (92%) delete mode 100644 bigframes/operations/generic_ops/isnull_op.py delete mode 100644 
bigframes/operations/generic_ops/notnull_op.py diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index af64e6a1bf..27660d2ba7 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -30,8 +30,8 @@ import pyarrow as pa from bigframes.core import utils -import bigframes.core.compile.aggregate_compiler as agg_compiler import bigframes.core.compile.googlesql +import bigframes.core.compile.ibis_compiler.aggregate_compiler as agg_compiler import bigframes.core.compile.ibis_compiler.scalar_op_compiler as op_compilers import bigframes.core.compile.ibis_types import bigframes.core.expression as ex diff --git a/bigframes/core/compile/ibis_compiler/__init__.py b/bigframes/core/compile/ibis_compiler/__init__.py index 125e161fef..aef0ed9267 100644 --- a/bigframes/core/compile/ibis_compiler/__init__.py +++ b/bigframes/core/compile/ibis_compiler/__init__.py @@ -20,6 +20,5 @@ from __future__ import annotations -import bigframes.core.compile.ibis_compiler.operations.generic_ops.isnull_op # noqa: F401 -import bigframes.core.compile.ibis_compiler.operations.generic_ops.notnull_op # noqa: F401 +import bigframes.core.compile.ibis_compiler.operations.generic_ops # noqa: F401 import bigframes.core.compile.ibis_compiler.scalar_op_registry # noqa: F401 diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/ibis_compiler/aggregate_compiler.py similarity index 100% rename from bigframes/core/compile/aggregate_compiler.py rename to bigframes/core/compile/ibis_compiler/aggregate_compiler.py diff --git a/bigframes/core/compile/ibis_compiler/operations/generic_ops/isnull_op.py b/bigframes/core/compile/ibis_compiler/operations/generic_ops.py similarity index 61% rename from bigframes/core/compile/ibis_compiler/operations/generic_ops/isnull_op.py rename to bigframes/core/compile/ibis_compiler/operations/generic_ops.py index 75f8c37ab7..78f6a0c4de 100644 --- 
a/bigframes/core/compile/ibis_compiler/operations/generic_ops/isnull_op.py +++ b/bigframes/core/compile/ibis_compiler/operations/generic_ops.py @@ -12,14 +12,27 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +BigFrames -> Ibis compilation for the operations in bigframes.operations.generic_ops. + +Please keep implementations in sequential order by op name. +""" + from __future__ import annotations from bigframes_vendored.ibis.expr import types as ibis_types from bigframes.core.compile.ibis_compiler import scalar_op_compiler -from bigframes.operations.generic_ops import isnull_op +from bigframes.operations import generic_ops + +register_unary_op = scalar_op_compiler.scalar_op_compiler.register_unary_op + + +@register_unary_op(generic_ops.notnull_op) +def notnull_op_impl(x: ibis_types.Value): + return x.notnull() -@scalar_op_compiler.scalar_op_compiler.register_unary_op(isnull_op.isnull_op) -def _ibis_isnull_op_impl(x: ibis_types.Value): +@register_unary_op(generic_ops.isnull_op) +def isnull_op_impl(x: ibis_types.Value): return x.isnull() diff --git a/bigframes/core/compile/ibis_compiler/operations/generic_ops/__init__.py b/bigframes/core/compile/ibis_compiler/operations/generic_ops/__init__.py deleted file mode 100644 index 0a2669d7a2..0000000000 --- a/bigframes/core/compile/ibis_compiler/operations/generic_ops/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/bigframes/core/compile/ibis_compiler/operations/generic_ops/notnull_op.py b/bigframes/core/compile/ibis_compiler/operations/generic_ops/notnull_op.py deleted file mode 100644 index d773176593..0000000000 --- a/bigframes/core/compile/ibis_compiler/operations/generic_ops/notnull_op.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import annotations - -import dataclasses -from typing import ClassVar - -# Imports for Ibis compilation -from bigframes_vendored.ibis.expr import types as ibis_types - -# Direct imports from bigframes -from bigframes import dtypes -from bigframes.core.compile.ibis_compiler import scalar_op_compiler -from bigframes.operations import base_ops - - -@dataclasses.dataclass(frozen=True) -class NotNullOp(base_ops.UnaryOp): - name: ClassVar[str] = "notnull" - - def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - return dtypes.BOOL_DTYPE - - -notnull_op = NotNullOp() - - -def _ibis_notnull_op_impl(x: ibis_types.Value): - return x.notnull() - - -scalar_op_compiler.scalar_op_compiler.register_unary_op(notnull_op)( - _ibis_notnull_op_impl -) - - -__all__ = [ - "NotNullOp", - "notnull_op", -] diff --git a/bigframes/core/compile/polars/__init__.py b/bigframes/core/compile/polars/__init__.py index 77ea0cd67a..fac1ef1d77 100644 --- a/bigframes/core/compile/polars/__init__.py +++ b/bigframes/core/compile/polars/__init__.py @@ -20,12 +20,15 @@ import warnings +# The ops imports appear first so that the implementations can be registered. +# polars shouldn't be needed at import time, as register is a no-op if polars +# isn't installed. 
+import bigframes.core.compile.polars.operations.generic_ops # noqa: F401 + try: import polars # noqa from bigframes.core.compile.polars.compiler import PolarsCompiler - import bigframes.core.compile.polars.operations.generic_ops.isnull_op # noqa: F401 - import bigframes.core.compile.polars.operations.generic_ops.notnull_op # noqa: F401 __all__ = ["PolarsCompiler"] except Exception as exc: diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index ab62570342..87ed11b946 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -17,7 +17,7 @@ import functools import itertools import operator -from typing import cast, Literal, Optional, Sequence, Tuple, TYPE_CHECKING +from typing import cast, Literal, Optional, Sequence, Tuple, Type, TYPE_CHECKING import pandas as pd @@ -46,6 +46,27 @@ except Exception: polars_installed = False + +def register_op(op: Type): + """Register a compilation from BigFrames to Ibis. + + This decorator can be used, even if Polars is not installed. + + Args: + op: The type of the operator the wrapped function compiles. + """ + + def decorator(func): + if polars_installed: + # Ignore the type because compile_op is a generic Callable, so + # register isn't available according to mypy. 
+ return PolarsExpressionCompiler.compile_op.register(op)(func) # type: ignore + else: + return func + + return decorator + + if polars_installed: _DTYPE_MAPPING = { # Direct mappings diff --git a/bigframes/core/compile/polars/operations/generic_ops/notnull_op.py b/bigframes/core/compile/polars/operations/generic_ops.py similarity index 61% rename from bigframes/core/compile/polars/operations/generic_ops/notnull_op.py rename to bigframes/core/compile/polars/operations/generic_ops.py index e92bc750e3..de0e987aa2 100644 --- a/bigframes/core/compile/polars/operations/generic_ops/notnull_op.py +++ b/bigframes/core/compile/polars/operations/generic_ops.py @@ -12,28 +12,36 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +BigFrames -> Polars compilation for the operations in bigframes.operations.generic_ops. + +Please keep implementations in sequential order by op name. +""" + from __future__ import annotations from typing import TYPE_CHECKING import bigframes.core.compile.polars.compiler as polars_compiler -from bigframes.operations.generic_ops import notnull_op +from bigframes.operations import generic_ops if TYPE_CHECKING: import polars as pl -def _polars_notnull_op_impl( +@polars_compiler.register_op(generic_ops.NotNullOp) +def notnull_op_impl( compiler: polars_compiler.PolarsExpressionCompiler, - op: notnull_op.NotNullOp, + op: generic_ops.NotNullOp, # type: ignore input: pl.Expr, ) -> pl.Expr: return input.is_not_null() -if hasattr(polars_compiler, "PolarsExpressionCompiler"): - # TODO(https://github.com/python/mypy/issues/13040): remove `type: ignore` - # when mypy can better handle singledispatch. 
- polars_compiler.PolarsExpressionCompiler.compile_op.register( # type: ignore - notnull_op.NotNullOp, _polars_notnull_op_impl - ) +@polars_compiler.register_op(generic_ops.IsNullOp) +def isnull_op_impl( + compiler: polars_compiler.PolarsExpressionCompiler, + op: generic_ops.IsNullOp, # type: ignore + input: pl.Expr, +) -> pl.Expr: + return input.is_null() diff --git a/bigframes/core/compile/polars/operations/generic_ops/__init__.py b/bigframes/core/compile/polars/operations/generic_ops/__init__.py deleted file mode 100644 index 0a2669d7a2..0000000000 --- a/bigframes/core/compile/polars/operations/generic_ops/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/bigframes/core/compile/polars/operations/generic_ops/isnull_op.py b/bigframes/core/compile/polars/operations/generic_ops/isnull_op.py deleted file mode 100644 index e2b2652f75..0000000000 --- a/bigframes/core/compile/polars/operations/generic_ops/isnull_op.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -from typing import TYPE_CHECKING - -import bigframes.core.compile.polars.compiler as polars_compiler -from bigframes.operations.generic_ops import isnull_op - -if TYPE_CHECKING: - import polars as pl - - -def _polars_isnull_op_impl( - compiler: polars_compiler.PolarsExpressionCompiler, - op: isnull_op.IsNullOp, - input: pl.Expr, -) -> pl.Expr: - return input.is_null() - - -if hasattr(polars_compiler, "PolarsExpressionCompiler"): - # TODO(https://github.com/python/mypy/issues/13040): remove `type: ignore` - # when mypy can better handle singledispatch. 
- polars_compiler.PolarsExpressionCompiler.compile_op.register( # type: ignore - isnull_op.IsNullOp, _polars_isnull_op_impl - ) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index abf3ed26fc..86098d47cf 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -81,15 +81,15 @@ hash_op, invert_op, IsInOp, + isnull_op, MapOp, maximum_op, minimum_op, + notnull_op, RowKey, SqlScalarOp, where_op, ) -from bigframes.operations.generic_ops.isnull_op import isnull_op -from bigframes.operations.generic_ops.notnull_op import notnull_op from bigframes.operations.geo_ops import ( geo_area_op, geo_st_astext_op, diff --git a/bigframes/operations/generic_ops/__init__.py b/bigframes/operations/generic_ops.py similarity index 92% rename from bigframes/operations/generic_ops/__init__.py rename to bigframes/operations/generic_ops.py index a8e876da0d..3c3f9653b4 100644 --- a/bigframes/operations/generic_ops/__init__.py +++ b/bigframes/operations/generic_ops.py @@ -29,6 +29,22 @@ ) invert_op = InvertOp() +IsNullOp = base_ops.create_unary_op( + name="isnull", + type_signature=op_typing.FixedOutputType( + lambda x: True, dtypes.BOOL_DTYPE, description="nullable" + ), +) +isnull_op = IsNullOp() + +NotNullOp = base_ops.create_unary_op( + name="notnull", + type_signature=op_typing.FixedOutputType( + lambda x: True, dtypes.BOOL_DTYPE, description="nullable" + ), +) +notnull_op = NotNullOp() + HashOp = base_ops.create_unary_op( name="hash", type_signature=op_typing.FixedOutputType( diff --git a/bigframes/operations/generic_ops/isnull_op.py b/bigframes/operations/generic_ops/isnull_op.py deleted file mode 100644 index 29c7e82f95..0000000000 --- a/bigframes/operations/generic_ops/isnull_op.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -import dataclasses -from typing import ClassVar - -from bigframes import dtypes -from bigframes.operations import base_ops - - -@dataclasses.dataclass(frozen=True) -class IsNullOp(base_ops.UnaryOp): - name: ClassVar[str] = "isnull" - - def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - return dtypes.BOOL_DTYPE - - -isnull_op = IsNullOp() -__all__ = [ - "IsNullOp", - "isnull_op", -] diff --git a/bigframes/operations/generic_ops/notnull_op.py b/bigframes/operations/generic_ops/notnull_op.py deleted file mode 100644 index d8a034a1b4..0000000000 --- a/bigframes/operations/generic_ops/notnull_op.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import annotations - -import dataclasses -from typing import ClassVar - -from bigframes import dtypes -from bigframes.operations import base_ops - - -@dataclasses.dataclass(frozen=True) -class NotNullOp(base_ops.UnaryOp): - name: ClassVar[str] = "notnull" - - def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - return dtypes.BOOL_DTYPE - - -notnull_op = NotNullOp() - - -__all__ = [ - "NotNullOp", - "notnull_op", -] From 6372a236a2b38bdccb508882e0bda4d743f06d26 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Wed, 6 Aug 2025 16:52:04 +0000 Subject: [PATCH 10/18] revert expression change --- bigframes/core/expression.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index ce31a740e6..7b20e430ff 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -19,18 +19,15 @@ import functools import itertools import typing -from typing import Callable, Generator, Mapping, TYPE_CHECKING, TypeVar, Union +from typing import Callable, Generator, Mapping, TypeVar, Union import pandas as pd from bigframes import dtypes from bigframes.core import field import bigframes.core.identifiers as ids - -if TYPE_CHECKING: - # Avoid circular imports. 
- import bigframes.operations - import bigframes.operations.aggregations as agg_ops +import bigframes.operations +import bigframes.operations.aggregations as agg_ops def const( From ef06d658fdfd73790612062b887dab0df0216885 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 6 Aug 2025 12:28:22 -0500 Subject: [PATCH 11/18] Update bigframes/core/compile/polars/operations/__init__.py --- bigframes/core/compile/polars/operations/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/core/compile/polars/operations/__init__.py b/bigframes/core/compile/polars/operations/__init__.py index 83d08723fc..26444dcb67 100644 --- a/bigframes/core/compile/polars/operations/__init__.py +++ b/bigframes/core/compile/polars/operations/__init__.py @@ -17,5 +17,5 @@ This directory structure should reflect the same layout as the `bigframes/operations` directory where the operations are defined. -Prefer one file per op to keep file sizes manageable for text editors and LLMs. +Prefer small groups of ops per file to keep file sizes manageable for text editors and LLMs. """ From 7c19d8b192cf90925ee968bfed7d71549cba53a8 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Wed, 6 Aug 2025 17:45:37 +0000 Subject: [PATCH 12/18] skip polars test for old polars --- bigframes/core/compile/polars/__init__.py | 10 ++++++---- bigframes/core/compile/polars/compiler.py | 6 +++++- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/bigframes/core/compile/polars/__init__.py b/bigframes/core/compile/polars/__init__.py index fac1ef1d77..7ae6fcc755 100644 --- a/bigframes/core/compile/polars/__init__.py +++ b/bigframes/core/compile/polars/__init__.py @@ -26,13 +26,15 @@ import bigframes.core.compile.polars.operations.generic_ops # noqa: F401 try: - import polars # noqa + import bigframes._importing + + # Use import_polars() instead of importing directly so that we check the + # version numbers. 
+ bigframes._importing.import_polars() from bigframes.core.compile.polars.compiler import PolarsCompiler __all__ = ["PolarsCompiler"] except Exception as exc: - msg = ( - f"Polars compiler not available as polars is not installed. Details: {str(exc)}" - ) + msg = f"Polars compiler not available as there was an exception importing polars. Details: {str(exc)}" warnings.warn(msg) diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index 87ed11b946..f3088d69a8 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -42,7 +42,11 @@ import polars as pl else: try: - import polars as pl + import bigframes._importing + + # Use import_polars() instead of importing directly so that we check + # the version numbers. + pl = bigframes._importing.import_polars() except Exception: polars_installed = False From 7babc87455c6678932aa33ac1a36016d53680c31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 6 Aug 2025 12:48:06 -0500 Subject: [PATCH 13/18] Update bigframes/core/compile/ibis_compiler/operations/__init__.py --- bigframes/core/compile/ibis_compiler/operations/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/core/compile/ibis_compiler/operations/__init__.py b/bigframes/core/compile/ibis_compiler/operations/__init__.py index cec484f908..9d9f3849ab 100644 --- a/bigframes/core/compile/ibis_compiler/operations/__init__.py +++ b/bigframes/core/compile/ibis_compiler/operations/__init__.py @@ -17,5 +17,5 @@ This directory structure should reflect the same layout as the `bigframes/operations` directory where the operations are defined. -Prefer one file per op to keep file sizes manageable for text editors and LLMs. +Prefer a few ops per file to keep file sizes manageable for text editors and LLMs. 
""" From aa5c47de8cacc441c721405ba9b17d7d263daca3 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Wed, 6 Aug 2025 17:55:02 +0000 Subject: [PATCH 14/18] add minversion to skips --- tests/unit/test_dataframe_polars.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py index 79f2049da8..32e051babf 100644 --- a/tests/unit/test_dataframe_polars.py +++ b/tests/unit/test_dataframe_polars.py @@ -38,7 +38,7 @@ convert_pandas_dtypes, ) -pytest.importorskip("polars") +pytest.importorskip("polars", minversion="1.7.0") pytest.importorskip("pandas", minversion="2.0.0") CURRENT_DIR = pathlib.Path(__file__).parent From 0c9802de08a4caac0dfbfbb5ba0f12181fa1f095 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Wed, 6 Aug 2025 18:02:47 +0000 Subject: [PATCH 15/18] more skips --- tests/system/small/test_polars_execution.py | 2 +- tests/unit/test_local_engine.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/system/small/test_polars_execution.py b/tests/system/small/test_polars_execution.py index 1568a76ec9..0b5a74f32c 100644 --- a/tests/system/small/test_polars_execution.py +++ b/tests/system/small/test_polars_execution.py @@ -16,7 +16,7 @@ import bigframes from bigframes.testing.utils import assert_pandas_df_equal -polars = pytest.importorskip("polars", reason="polars is required for this test") +polars = pytest.importorskip("polars", minversion="1.7.0") @pytest.fixture(scope="module") diff --git a/tests/unit/test_local_engine.py b/tests/unit/test_local_engine.py index 509bc6ade2..d78b8d774a 100644 --- a/tests/unit/test_local_engine.py +++ b/tests/unit/test_local_engine.py @@ -20,7 +20,7 @@ import bigframes import bigframes.pandas as bpd -pytest.importorskip("polars") +pytest.importorskip("polars", minversion="1.7.0") pytest.importorskip("pandas", minversion="2.0.0") From c2f1ca8d8cac9d3456cd45eca85de8c394a9d51b Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Wed, 6 Aug 
2025 19:19:48 +0000 Subject: [PATCH 16/18] fix minimum polars version detection --- bigframes/_importing.py | 11 +- noxfile.py | 26 +-- setup.py | 4 +- testing/constraints-3.10.txt | 5 +- testing/constraints-3.11.txt | 175 ++++++++++++++++++++ tests/system/small/test_polars_execution.py | 2 +- tests/unit/test_dataframe_polars.py | 2 +- tests/unit/test_local_engine.py | 2 +- 8 files changed, 207 insertions(+), 20 deletions(-) diff --git a/bigframes/_importing.py b/bigframes/_importing.py index 095a1d9c51..e88bd77fe8 100644 --- a/bigframes/_importing.py +++ b/bigframes/_importing.py @@ -14,6 +14,7 @@ import importlib from types import ModuleType +import numpy from packaging import version # Keep this in sync with setup.py @@ -22,9 +23,13 @@ def import_polars() -> ModuleType: polars_module = importlib.import_module("polars") - imported_version = version.Version(polars_module.build_info()["version"]) - if imported_version < POLARS_MIN_VERSION: + # Check for necessary methods instead of the version number because we + # can't trust the polars version until + # https://github.com/pola-rs/polars/issues/23940 is fixed. 
+ try: + polars_module.lit(numpy.int64(100), dtype=polars_module.Int64()) + except TypeError: raise ImportError( - f"Imported polars version: {imported_version} is below the minimum version: {POLARS_MIN_VERSION}" + f"Imported polars version is likely below the minimum version: {POLARS_MIN_VERSION}" ) return polars_module diff --git a/noxfile.py b/noxfile.py index 2d0edfc1b0..573a439abd 100644 --- a/noxfile.py +++ b/noxfile.py @@ -78,15 +78,20 @@ ] UNIT_TEST_LOCAL_DEPENDENCIES: List[str] = [] UNIT_TEST_DEPENDENCIES: List[str] = [] -UNIT_TEST_EXTRAS: List[str] = ["tests", "anywidget"] +UNIT_TEST_EXTRAS: List[str] = ["tests"] UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { - "3.12": ["tests", "polars", "scikit-learn", "anywidget"], + "3.10": ["tests", "polars", "scikit-learn", "anywidget"], + "3.11": ["tests", "polars", "scikit-learn", "anywidget"], + # Make sure we leave some versions without "extras" so we know those + # dependencies are actually optional. + "3.13": ["tests", "polars", "scikit-learn", "anywidget"], } +# 3.11 is used by colab. # 3.10 is needed for Windows tests as it is the only version installed in the # bigframes-windows container image. For more information, search # bigframes/windows-docker, internally. -SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.12", "3.13"] +SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.13"] SYSTEM_TEST_STANDARD_DEPENDENCIES = [ "jinja2", "mock", @@ -105,12 +110,13 @@ ] SYSTEM_TEST_LOCAL_DEPENDENCIES: List[str] = [] SYSTEM_TEST_DEPENDENCIES: List[str] = [] -SYSTEM_TEST_EXTRAS: List[str] = [] +SYSTEM_TEST_EXTRAS: List[str] = ["tests"] SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { - "3.9": ["tests", "anywidget"], - "3.10": ["tests", "polars"], - "3.12": ["tests", "scikit-learn", "polars", "anywidget"], - "3.13": ["tests", "polars"], + # Make sure we leave some versions without "extras" so we know those + # dependencies are actually optional. 
+ "3.10": ["scikit-learn", "polars", "anywidget"], + "3.11": ["scikit-learn", "polars", "anywidget"], + "3.13": ["polars", "anywidget"], } LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME" @@ -120,8 +126,8 @@ # Sessions are executed in the order so putting the smaller sessions # ahead to fail fast at presubmit running. nox.options.sessions = [ - "system-3.9", - "system-3.12", + "system-3.9", # No extras. + "system-3.11", "cover", # TODO(b/401609005): remove "cleanup", diff --git a/setup.py b/setup.py index bc42cc4281..2aef514749 100644 --- a/setup.py +++ b/setup.py @@ -76,8 +76,8 @@ "google-cloud-bigtable >=2.24.0", "google-cloud-pubsub >=2.21.4", ], - # used for local engine, which is only needed for unit tests at present. - "polars": ["polars >= 1.7.0"], + # used for local engine + "polars": ["polars >= 1.21.0"], "scikit-learn": ["scikit-learn>=1.2.2"], # Packages required for basic development flow. "dev": [ diff --git a/testing/constraints-3.10.txt b/testing/constraints-3.10.txt index 12ad443aab..1695a4806b 100644 --- a/testing/constraints-3.10.txt +++ b/testing/constraints-3.10.txt @@ -1,4 +1,5 @@ -# Keep in sync with colab/containers/requirements.core.in image +# When we drop Python 3.9, +# please keep these in sync with the minimum versions in setup.py google-auth==2.27.0 ipykernel==5.5.6 ipython==7.34.0 @@ -15,4 +16,4 @@ matplotlib==3.7.1 psutil==5.9.5 seaborn==0.13.1 traitlets==5.7.1 -polars==1.7.0 +polars==1.21.0 diff --git a/testing/constraints-3.11.txt b/testing/constraints-3.11.txt index e69de29bb2..5e837332fc 100644 --- a/testing/constraints-3.11.txt +++ b/testing/constraints-3.11.txt @@ -0,0 +1,175 @@ +# Keep in sync with colab/containers/requirements.core.in image. 
+google-auth==2.38.0 +ipykernel==6.17.1 +ipython==7.34.0 +jupyter-server==1.16.0 +ipyparallel==8.8.0 +pandas==2.2.2 +portpicker==1.5.2 +requests==2.32.3 +tornado==6.4.2 +absl-py==1.4.0 +debugpy==1.8.15 +ipywidgets==7.7.1 +matplotlib==3.10.0 +psutil==5.9.5 +seaborn==0.13.2 +traitlets==5.7.1 +# Keep in sync with colab/containers/requirements.in image. +Flask==3.1.0 +PyDrive2==1.21.1 +PyYAML==6.0 +Sphinx==8.2.3 +albumentations==2.0.0 +albucore==0.0.24 +arviz==0.22.0 +astropy==7.0 +autograd==1.7 +bigquery-magics==0.10.2 +bottleneck==1.4.0 +bokeh==3.7.2 +cmake==3.31.2 +community==1.0.0b1 +cufflinks==0.17.0 +cuda-python==12.6.0 +cuml-cu12==25.6.0 +cupy-cuda12x==13.3.0 +cvxopt==1.3.0 +cvxpy==1.6.0 +cyipopt==1.5.0 +cython==3.0.11 +dask==2025.5.0 +dataproc-spark-connect==0.8.1 +datasets==4.0 +diffusers==0.33 +dlib==19.24.6 +dopamine-rl==4.1.0 +duckdb==1.3.0 +earthengine-api==1.5.0 +easydict==1.13 +editdistance==0.8.0 +einops==0.8.0 +fastai==2.7.19 +firebase-admin==6.7 +folium==0.20 +future==1.0.0 +gcsfs==2025.3 +geemap==0.35.1 +geopandas==1.1 +geopy==2.4.0 +glob2==0.7 +google-api-core==2.24 +google-api-python-client==2.167 +google-auth-httplib2==0.2.0 +google-cloud-aiplatform==1.85 +google-cloud-bigquery==3.31 +google-cloud-bigquery-connection==1.18 +google-cloud-core==2.4 +google-cloud-datastore==2.20 +google-cloud-firestore==2.20 +google-cloud-functions==1.20 +google-cloud-language==2.17 +google-cloud-translate==3.20 +google-generativeai==0.8 +googledrivedownloader==1.1.0 +gradio==5.37 +graphviz==0.20 +gym==0.25.0 +h5py==3.13 +hdbscan==0.8 +highspy==1.10 +holidays==0.66 +holoviews==1.20 +html5lib==1.1 +httpimport==1.4 +humanize==4.12 +hyperopt==0.2 +ibis-framework==9.5.0 +imageio==2.37 +imbalanced-learn==0.13 +imutils==0.5 +inflect==7.5 +ipyleaflet==0.19 +ipython-sql==0.5 +jieba==0.42 +jinja2==3.1 +jsonschema==4.23 +jupytext==1.17.1 +keyrings.google-artifactregistry-auth==1.1.2 +langchain==0.3.20 +libpysal==4.13.0 +lightgbm==4.6.0 +lxml==5.4 +matplotlib-venn==1.1 
+missingno==0.5 +mizani==0.13.0 +mkl==2025.2.0 +mlxtend==0.23 +more-itertools==10.7 +moviepy==1.0.3 +music21==9.3.0 +natsort==8.4 +nbconvert==7.16 +nibabel==5.3 +numba==0.60.0 +oauth2client==4.0 +openai==1.94 +opencv-contrib-python==4.12 +opencv-python==4.12 +openpyxl==3.1 +pandas-datareader==0.10 +pandas-gbq==0.28 +peft==0.15 +pillow==11.3.0 +plotnine==0.14.0 +polars==1.25.2 +progressbar2==4.5.0 +prophet==1.1.6 +protobuf==5.29.5 +psycopg2==2.9.10 +pyarrow==18.1.0 +pycocotools==2.0.8 +pydotplus==2.0.2 +pydot==3.0.0 +pygit2==1.17 +pygments==2.18 +pygobject==3.42 +pymc==5.24 +pyomo==6.8 +pyopengl==3.1.7 +pyspark==3.5 +python-dateutil==2.8 +python-louvain==0.16 +pytz==2025.1 +regex==2024.11.6 +sentence-transformers==5.0 +shap==0.47 +spanner-graph-notebook==1.1 +sqlalchemy==2.0.36 +statsmodels==0.14.0 +sympy==1.13.0 +tables==3.10.1 +tabulate==0.9.0 +tensorboard==2.19.0 +tensorflow==2.19.0 +tensorflow-decision-forests==1.12.0 +tensorflow-hub==0.16.1 +tf-keras==2.19.0 +keras==3.10.0 +keras-hub==0.21.1 +keras-nlp==0.21.1 +textblob==0.19.0 +timm==1.0.17 +torchao==0.10.0 +torchsummary==1.5.1 +torchtune==0.6.1 +tsfresh==0.21.0 +tweepy==4.16 +umap-learn==0.5.0 +wandb==0.21 +wordcloud==1.9.0 +xarray==2025.7 +xgboost==3.0 +xlrd==2.0.1 +yellowbrick==1.5 +yfinance==0.2 diff --git a/tests/system/small/test_polars_execution.py b/tests/system/small/test_polars_execution.py index 0b5a74f32c..916780b1ce 100644 --- a/tests/system/small/test_polars_execution.py +++ b/tests/system/small/test_polars_execution.py @@ -16,7 +16,7 @@ import bigframes from bigframes.testing.utils import assert_pandas_df_equal -polars = pytest.importorskip("polars", minversion="1.7.0") +polars = pytest.importorskip("polars") @pytest.fixture(scope="module") diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py index 32e051babf..79f2049da8 100644 --- a/tests/unit/test_dataframe_polars.py +++ b/tests/unit/test_dataframe_polars.py @@ -38,7 +38,7 @@ convert_pandas_dtypes, ) 
-pytest.importorskip("polars", minversion="1.7.0") +pytest.importorskip("polars") pytest.importorskip("pandas", minversion="2.0.0") CURRENT_DIR = pathlib.Path(__file__).parent diff --git a/tests/unit/test_local_engine.py b/tests/unit/test_local_engine.py index d78b8d774a..509bc6ade2 100644 --- a/tests/unit/test_local_engine.py +++ b/tests/unit/test_local_engine.py @@ -20,7 +20,7 @@ import bigframes import bigframes.pandas as bpd -pytest.importorskip("polars", minversion="1.7.0") +pytest.importorskip("polars") pytest.importorskip("pandas", minversion="2.0.0") From 863b8ed7c8f332630dc473bb57d94565661ea1ac Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Wed, 6 Aug 2025 19:24:17 +0000 Subject: [PATCH 17/18] update colab constraints --- testing/constraints-3.11.txt | 710 ++++++++++++++++++++++++++++------- 1 file changed, 578 insertions(+), 132 deletions(-) diff --git a/testing/constraints-3.11.txt b/testing/constraints-3.11.txt index 5e837332fc..8fd20d453b 100644 --- a/testing/constraints-3.11.txt +++ b/testing/constraints-3.11.txt @@ -1,175 +1,621 @@ -# Keep in sync with colab/containers/requirements.core.in image. -google-auth==2.38.0 -ipykernel==6.17.1 -ipython==7.34.0 -jupyter-server==1.16.0 -ipyparallel==8.8.0 -pandas==2.2.2 -portpicker==1.5.2 -requests==2.32.3 -tornado==6.4.2 +# Keep in sync with %pip freeze in colab. +# Note: These are just constraints, so it's ok to have extra packages we +# aren't installing, except in the version that gets used for prerelease +# tests. absl-py==1.4.0 -debugpy==1.8.15 -ipywidgets==7.7.1 -matplotlib==3.10.0 -psutil==5.9.5 -seaborn==0.13.2 -traitlets==5.7.1 -# Keep in sync with colab/containers/requirements.in image. 
-Flask==3.1.0 -PyDrive2==1.21.1 -PyYAML==6.0 -Sphinx==8.2.3 -albumentations==2.0.0 +accelerate==1.9.0 +aiofiles==24.1.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.12.15 +aiosignal==1.4.0 +alabaster==1.0.0 albucore==0.0.24 +albumentations==2.0.8 +ale-py==0.11.2 +altair==5.5.0 +annotated-types==0.7.0 +antlr4-python3-runtime==4.9.3 +anyio==4.10.0 +anywidget==0.9.18 +argon2-cffi==25.1.0 +argon2-cffi-bindings==25.1.0 +array_record==0.7.2 arviz==0.22.0 -astropy==7.0 -autograd==1.7 +astropy==7.1.0 +astropy-iers-data==0.2025.8.4.0.42.59 +astunparse==1.6.3 +atpublic==5.1 +attrs==25.3.0 +audioread==3.0.1 +autograd==1.8.0 +babel==2.17.0 +backcall==0.2.0 +backports.tarfile==1.2.0 +beautifulsoup4==4.13.4 +betterproto==2.0.0b6 bigquery-magics==0.10.2 -bottleneck==1.4.0 -bokeh==3.7.2 -cmake==3.31.2 +bleach==6.2.0 +blinker==1.9.0 +blis==1.3.0 +blobfile==3.0.0 +blosc2==3.6.1 +bokeh==3.7.3 +Bottleneck==1.4.2 +bqplot==0.12.45 +branca==0.8.1 +Brotli==1.1.0 +build==1.3.0 +CacheControl==0.14.3 +cachetools==5.5.2 +catalogue==2.0.10 +certifi==2025.8.3 +cffi==1.17.1 +chardet==5.2.0 +charset-normalizer==3.4.2 +chex==0.1.90 +clarabel==0.11.1 +click==8.2.1 +cloudpathlib==0.21.1 +cloudpickle==3.1.1 +cmake==3.31.6 +cmdstanpy==1.2.5 +colorcet==3.1.0 +colorlover==0.3.0 +colour==0.1.5 community==1.0.0b1 -cufflinks==0.17.0 -cuda-python==12.6.0 +confection==0.1.5 +cons==0.4.7 +contourpy==1.3.3 +cramjam==2.11.0 +cryptography==43.0.3 +cuda-python==12.6.2.post1 +cudf-polars-cu12==25.6.0 +cufflinks==0.17.3 cuml-cu12==25.6.0 cupy-cuda12x==13.3.0 -cvxopt==1.3.0 -cvxpy==1.6.0 +curl_cffi==0.12.0 +cuvs-cu12==25.6.1 +cvxopt==1.3.2 +cvxpy==1.6.7 +cycler==0.12.1 cyipopt==1.5.0 -cython==3.0.11 +cymem==2.0.11 +Cython==3.0.12 dask==2025.5.0 -dataproc-spark-connect==0.8.1 -datasets==4.0 -diffusers==0.33 +dask-cuda==25.6.0 +dask-cudf-cu12==25.6.0 +dataproc-spark-connect==0.8.3 +datasets==4.0.0 +db-dtypes==1.4.3 +dbus-python==1.2.18 +debugpy==1.8.15 +decorator==4.4.2 +defusedxml==0.7.1 +diffusers==0.34.0 +dill==0.3.8 
+distributed==2025.5.0 +distributed-ucxx-cu12==0.44.0 +distro==1.9.0 dlib==19.24.6 -dopamine-rl==4.1.0 -duckdb==1.3.0 -earthengine-api==1.5.0 +dm-tree==0.1.9 +docstring_parser==0.17.0 +docutils==0.21.2 +dopamine_rl==4.1.2 +duckdb==1.3.2 +earthengine-api==1.5.24 easydict==1.13 -editdistance==0.8.0 -einops==0.8.0 +editdistance==0.8.1 +eerepr==0.1.2 +einops==0.8.1 +entrypoints==0.4 +et_xmlfile==2.0.0 +etils==1.13.0 +etuples==0.3.10 +Farama-Notifications==0.0.4 fastai==2.7.19 -firebase-admin==6.7 -folium==0.20 +fastapi==0.116.1 +fastcore==1.7.29 +fastdownload==0.0.7 +fastjsonschema==2.21.1 +fastprogress==1.0.3 +fastrlock==0.8.3 +ffmpy==0.6.1 +filelock==3.18.0 +firebase-admin==6.9.0 +Flask==3.1.1 +flatbuffers==25.2.10 +flax==0.10.6 +folium==0.20.0 +fonttools==4.59.0 +frozendict==2.4.6 +frozenlist==1.7.0 +fsspec==2025.3.0 future==1.0.0 -gcsfs==2025.3 -geemap==0.35.1 -geopandas==1.1 -geopy==2.4.0 +gast==0.6.0 +gcsfs==2025.3.0 +GDAL==3.8.4 +gdown==5.2.0 +geemap==0.35.3 +geocoder==1.38.1 +geographiclib==2.0 +geopandas==1.1.1 +geopy==2.4.1 +gin-config==0.5.0 +gitdb==4.0.12 +GitPython==3.1.45 glob2==0.7 -google-api-core==2.24 -google-api-python-client==2.167 +google==2.0.3 +google-ai-generativelanguage==0.6.15 +google-api-core==2.25.1 +google-api-python-client==2.177.0 +google-auth==2.38.0 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.85 -google-cloud-bigquery==3.31 -google-cloud-bigquery-connection==1.18 -google-cloud-core==2.4 -google-cloud-datastore==2.20 -google-cloud-firestore==2.20 -google-cloud-functions==1.20 -google-cloud-language==2.17 -google-cloud-translate==3.20 -google-generativeai==0.8 +google-auth-oauthlib==1.2.2 +google-cloud-aiplatform==1.106.0 +google-cloud-bigquery==3.35.1 +google-cloud-bigquery-connection==1.18.3 +google-cloud-bigquery-storage==2.32.0 +google-cloud-core==2.4.3 +google-cloud-dataproc==5.21.0 +google-cloud-datastore==2.21.0 +google-cloud-firestore==2.21.0 +google-cloud-functions==1.20.4 +google-cloud-language==2.17.2 
+google-cloud-resource-manager==1.14.2 +google-cloud-spanner==3.56.0 +google-cloud-storage==2.19.0 +google-cloud-translate==3.21.1 +google-crc32c==1.7.1 +google-genai==1.28.0 +google-generativeai==0.8.5 +google-pasta==0.2.0 +google-resumable-media==2.7.2 +googleapis-common-protos==1.70.0 googledrivedownloader==1.1.0 -gradio==5.37 -graphviz==0.20 -gym==0.25.0 -h5py==3.13 -hdbscan==0.8 -highspy==1.10 -holidays==0.66 -holoviews==1.20 +gradio==5.39.0 +gradio_client==1.11.0 +graphviz==0.21 +greenlet==3.2.3 +groovy==0.1.2 +grpc-google-iam-v1==0.14.2 +grpc-interceptor==0.15.4 +grpcio==1.74.0 +grpcio-status==1.71.2 +grpclib==0.4.8 +gspread==6.2.1 +gspread-dataframe==4.0.0 +gym==0.25.2 +gym-notices==0.1.0 +gymnasium==1.2.0 +h11==0.16.0 +h2==4.2.0 +h5netcdf==1.6.3 +h5py==3.14.0 +hdbscan==0.8.40 +hf-xet==1.1.5 +hf_transfer==0.1.9 +highspy==1.11.0 +holidays==0.78 +holoviews==1.21.0 +hpack==4.1.0 html5lib==1.1 -httpimport==1.4 -humanize==4.12 -hyperopt==0.2 +httpcore==1.0.9 +httpimport==1.4.1 +httplib2==0.22.0 +httpx==0.28.1 +huggingface-hub==0.34.3 +humanize==4.12.3 +hyperframe==6.1.0 +hyperopt==0.2.7 ibis-framework==9.5.0 -imageio==2.37 -imbalanced-learn==0.13 -imutils==0.5 -inflect==7.5 -ipyleaflet==0.19 -ipython-sql==0.5 -jieba==0.42 -jinja2==3.1 -jsonschema==4.23 -jupytext==1.17.1 +idna==3.10 +imageio==2.37.0 +imageio-ffmpeg==0.6.0 +imagesize==1.4.1 +imbalanced-learn==0.13.0 +immutabledict==4.2.1 +importlib_metadata==8.7.0 +importlib_resources==6.5.2 +imutils==0.5.4 +inflect==7.5.0 +iniconfig==2.1.0 +intel-cmplr-lib-ur==2025.2.0 +intel-openmp==2025.2.0 +ipyevents==2.0.2 +ipyfilechooser==0.6.0 +ipykernel==6.17.1 +ipyleaflet==0.20.0 +ipyparallel==8.8.0 +ipython==7.34.0 +ipython-genutils==0.2.0 +ipython-sql==0.5.0 +ipytree==0.2.2 +ipywidgets==7.7.1 +itsdangerous==2.2.0 +jaraco.classes==3.4.0 +jaraco.context==6.0.1 +jaraco.functools==4.2.1 +jax==0.5.3 +jax-cuda12-pjrt==0.5.3 +jax-cuda12-plugin==0.5.3 +jaxlib==0.5.3 +jeepney==0.9.0 +jieba==0.42.1 +Jinja2==3.1.6 +jiter==0.10.0 
+joblib==1.5.1 +jsonpatch==1.33 +jsonpickle==4.1.1 +jsonpointer==3.0.0 +jsonschema==4.25.0 +jsonschema-specifications==2025.4.1 +jupyter-client==6.1.12 +jupyter-console==6.1.0 +jupyter-leaflet==0.20.0 +jupyter-server==1.16.0 +jupyter_core==5.8.1 +jupyterlab_pygments==0.3.0 +jupyterlab_widgets==3.0.15 +jupytext==1.17.2 +kaggle==1.7.4.5 +kagglehub==0.3.12 +keras==3.10.0 +keras-hub==0.21.1 +keras-nlp==0.21.1 +keyring==25.6.0 keyrings.google-artifactregistry-auth==1.1.2 -langchain==0.3.20 +kiwisolver==1.4.8 +langchain==0.3.27 +langchain-core==0.3.72 +langchain-text-splitters==0.3.9 +langcodes==3.5.0 +langsmith==0.4.10 +language_data==1.3.0 +launchpadlib==1.10.16 +lazr.restfulclient==0.14.4 +lazr.uri==1.0.6 +lazy_loader==0.4 +libclang==18.1.1 +libcugraph-cu12==25.6.0 +libcuml-cu12==25.6.0 +libcuvs-cu12==25.6.1 +libkvikio-cu12==25.6.0 libpysal==4.13.0 -lightgbm==4.6.0 -lxml==5.4 -matplotlib-venn==1.1 -missingno==0.5 -mizani==0.13.0 +libraft-cu12==25.6.0 +librmm-cu12==25.6.0 +librosa==0.11.0 +libucx-cu12==1.18.1 +libucxx-cu12==0.44.0 +linkify-it-py==2.0.3 +llvmlite==0.43.0 +locket==1.0.0 +logical-unification==0.4.6 +lxml==5.4.0 +Mako==1.1.3 +marisa-trie==1.2.1 +Markdown==3.8.2 +markdown-it-py==3.0.0 +MarkupSafe==3.0.2 +matplotlib==3.10.0 +matplotlib-inline==0.1.7 +matplotlib-venn==1.1.2 +mdit-py-plugins==0.4.2 +mdurl==0.1.2 +miniKanren==1.0.5 +missingno==0.5.2 +mistune==3.1.3 +mizani==0.13.5 mkl==2025.2.0 -mlxtend==0.23 -more-itertools==10.7 +ml_dtypes==0.5.3 +mlxtend==0.23.4 +more-itertools==10.7.0 moviepy==1.0.3 +mpmath==1.3.0 +msgpack==1.1.1 +multidict==6.6.3 +multipledispatch==1.0.0 +multiprocess==0.70.16 +multitasking==0.0.12 +murmurhash==1.0.13 music21==9.3.0 -natsort==8.4 -nbconvert==7.16 -nibabel==5.3 +namex==0.1.0 +narwhals==2.0.1 +natsort==8.4.0 +nbclassic==1.3.1 +nbclient==0.10.2 +nbconvert==7.16.6 +nbformat==5.10.4 +ndindex==1.10.0 +nest-asyncio==1.6.0 +networkx==3.5 +nibabel==5.3.2 +nltk==3.9.1 +notebook==6.5.7 +notebook_shim==0.2.4 numba==0.60.0 
-oauth2client==4.0 -openai==1.94 -opencv-contrib-python==4.12 -opencv-python==4.12 -openpyxl==3.1 -pandas-datareader==0.10 -pandas-gbq==0.28 -peft==0.15 +numba-cuda==0.11.0 +numexpr==2.11.0 +numpy==2.0.2 +nvidia-cublas-cu12==12.5.3.2 +nvidia-cuda-cupti-cu12==12.5.82 +nvidia-cuda-nvcc-cu12==12.5.82 +nvidia-cuda-nvrtc-cu12==12.5.82 +nvidia-cuda-runtime-cu12==12.5.82 +nvidia-cudnn-cu12==9.3.0.75 +nvidia-cufft-cu12==11.2.3.61 +nvidia-curand-cu12==10.3.6.82 +nvidia-cusolver-cu12==11.6.3.83 +nvidia-cusparse-cu12==12.5.1.3 +nvidia-cusparselt-cu12==0.6.2 +nvidia-ml-py==12.575.51 +nvidia-nccl-cu12==2.23.4 +nvidia-nvjitlink-cu12==12.5.82 +nvidia-nvtx-cu12==12.4.127 +nvtx==0.2.13 +oauth2client==4.1.3 +oauthlib==3.3.1 +omegaconf==2.3.0 +openai==1.98.0 +opencv-contrib-python==4.12.0.88 +opencv-python==4.12.0.88 +opencv-python-headless==4.12.0.88 +openpyxl==3.1.5 +opt_einsum==3.4.0 +optax==0.2.5 +optree==0.17.0 +orbax-checkpoint==0.11.20 +orjson==3.11.1 +osqp==1.0.4 +packaging==25.0 +pandas==2.2.2 +pandas-datareader==0.10.0 +pandas-gbq==0.29.2 +pandas-stubs==2.2.2.240909 +pandocfilters==1.5.1 +panel==1.7.5 +param==2.2.1 +parso==0.8.4 +parsy==2.1 +partd==1.4.2 +patsy==1.0.1 +peewee==3.18.2 +peft==0.17.0 +pexpect==4.9.0 +pickleshare==0.7.5 pillow==11.3.0 -plotnine==0.14.0 +platformdirs==4.3.8 +plotly==5.24.1 +plotnine==0.14.5 +pluggy==1.6.0 +ply==3.11 polars==1.25.2 +pooch==1.8.2 +portpicker==1.5.2 +preshed==3.0.10 +prettytable==3.16.0 +proglog==0.1.12 progressbar2==4.5.0 -prophet==1.1.6 +prometheus_client==0.22.1 +promise==2.3 +prompt_toolkit==3.0.51 +propcache==0.3.2 +prophet==1.1.7 +proto-plus==1.26.1 protobuf==5.29.5 +psutil==5.9.5 psycopg2==2.9.10 +psygnal==0.14.0 +ptyprocess==0.7.0 +py-cpuinfo==9.0.0 +py4j==0.10.9.7 pyarrow==18.1.0 -pycocotools==2.0.8 +pyasn1==0.6.1 +pyasn1_modules==0.4.2 +pycairo==1.28.0 +pycocotools==2.0.10 +pycparser==2.22 +pycryptodomex==3.23.0 +pydantic==2.11.7 +pydantic_core==2.33.2 +pydata-google-auth==1.9.1 +pydot==3.0.4 pydotplus==2.0.2 
-pydot==3.0.0 -pygit2==1.17 -pygments==2.18 -pygobject==3.42 -pymc==5.24 -pyomo==6.8 -pyopengl==3.1.7 -pyspark==3.5 -python-dateutil==2.8 +PyDrive2==1.21.3 +pydub==0.25.1 +pyerfa==2.0.1.5 +pygame==2.6.1 +pygit2==1.18.1 +Pygments==2.19.2 +PyGObject==3.42.0 +PyJWT==2.10.1 +pylibcugraph-cu12==25.6.0 +pylibraft-cu12==25.6.0 +pymc==5.25.1 +pynndescent==0.5.13 +pynvjitlink-cu12==0.7.0 +pynvml==12.0.0 +pyogrio==0.11.1 +pyomo==6.9.2 +PyOpenGL==3.1.9 +pyOpenSSL==24.2.1 +pyparsing==3.2.3 +pyperclip==1.9.0 +pyproj==3.7.1 +pyproject_hooks==1.2.0 +pyshp==2.3.1 +PySocks==1.7.1 +pyspark==3.5.1 +pytensor==2.31.7 +python-apt==0.0.0 +python-box==7.3.2 +python-dateutil==2.9.0.post0 python-louvain==0.16 -pytz==2025.1 +python-multipart==0.0.20 +python-slugify==8.0.4 +python-snappy==0.7.3 +python-utils==3.9.1 +pytz==2025.2 +pyviz_comms==3.0.6 +PyWavelets==1.9.0 +PyYAML==6.0.2 +pyzmq==26.2.1 +raft-dask-cu12==25.6.0 +rapids-dask-dependency==25.6.0 +rapids-logger==0.1.1 +ratelim==0.1.6 +referencing==0.36.2 regex==2024.11.6 -sentence-transformers==5.0 -shap==0.47 -spanner-graph-notebook==1.1 -sqlalchemy==2.0.36 -statsmodels==0.14.0 -sympy==1.13.0 -tables==3.10.1 +requests==2.32.3 +requests-oauthlib==2.0.0 +requests-toolbelt==1.0.0 +requirements-parser==0.9.0 +rich==13.9.4 +rmm-cu12==25.6.0 +roman-numerals-py==3.1.0 +rpds-py==0.26.0 +rpy2==3.5.17 +rsa==4.9.1 +ruff==0.12.7 +safehttpx==0.1.6 +safetensors==0.5.3 +scikit-image==0.25.2 +scikit-learn==1.6.1 +scipy==1.16.1 +scooby==0.10.1 +scs==3.2.7.post2 +seaborn==0.13.2 +SecretStorage==3.3.3 +semantic-version==2.10.0 +Send2Trash==1.8.3 +sentence-transformers==4.1.0 +sentencepiece==0.2.0 +sentry-sdk==2.34.1 +shap==0.48.0 +shapely==2.1.1 +shellingham==1.5.4 +simple-parsing==0.1.7 +simplejson==3.20.1 +simsimd==6.5.0 +six==1.17.0 +sklearn-compat==0.1.3 +sklearn-pandas==2.2.0 +slicer==0.0.8 +smart_open==7.3.0.post1 +smmap==5.0.2 +sniffio==1.3.1 +snowballstemmer==3.0.1 +sortedcontainers==2.4.0 +soundfile==0.13.1 +soupsieve==2.7 +soxr==0.5.0.post1 
+spacy==3.8.7 +spacy-legacy==3.0.12 +spacy-loggers==1.0.5 +spanner-graph-notebook==1.1.7 +Sphinx==8.2.3 +sphinxcontrib-applehelp==2.0.0 +sphinxcontrib-devhelp==2.0.0 +sphinxcontrib-htmlhelp==2.1.0 +sphinxcontrib-jsmath==1.0.1 +sphinxcontrib-qthelp==2.0.0 +sphinxcontrib-serializinghtml==2.0.0 +SQLAlchemy==2.0.42 +sqlglot==25.20.2 +sqlparse==0.5.3 +srsly==2.5.1 +stanio==0.5.1 +starlette==0.47.2 +statsmodels==0.14.5 +stringzilla==3.12.5 +stumpy==1.13.0 +sympy==1.13.1 +tables==3.10.2 tabulate==0.9.0 +tbb==2022.2.0 +tblib==3.1.0 +tcmlib==1.4.0 +tenacity==8.5.0 tensorboard==2.19.0 +tensorboard-data-server==0.7.2 tensorflow==2.19.0 -tensorflow-decision-forests==1.12.0 +tensorflow-datasets==4.9.9 tensorflow-hub==0.16.1 -tf-keras==2.19.0 -keras==3.10.0 -keras-hub==0.21.1 -keras-nlp==0.21.1 +tensorflow-io-gcs-filesystem==0.37.1 +tensorflow-metadata==1.17.2 +tensorflow-probability==0.25.0 +tensorflow-text==2.19.0 +tensorflow_decision_forests==1.12.0 +tensorstore==0.1.76 +termcolor==3.1.0 +terminado==0.18.1 +text-unidecode==1.3 textblob==0.19.0 -timm==1.0.17 +tf-slim==1.1.0 +tf_keras==2.19.0 +thinc==8.3.6 +threadpoolctl==3.6.0 +tifffile==2025.6.11 +tiktoken==0.9.0 +timm==1.0.19 +tinycss2==1.4.0 +tokenizers==0.21.4 +toml==0.10.2 +tomlkit==0.13.3 +toolz==0.12.1 torchao==0.10.0 +torchdata==0.11.0 torchsummary==1.5.1 torchtune==0.6.1 +tornado==6.4.2 +tqdm==4.67.1 +traitlets==5.7.1 +traittypes==0.2.1 +transformers==4.54.1 +treelite==4.4.1 +treescope==0.1.9 +triton==3.2.0 tsfresh==0.21.0 -tweepy==4.16 -umap-learn==0.5.0 -wandb==0.21 -wordcloud==1.9.0 -xarray==2025.7 -xgboost==3.0 -xlrd==2.0.1 +tweepy==4.16.0 +typeguard==4.4.4 +typer==0.16.0 +types-pytz==2025.2.0.20250516 +types-setuptools==80.9.0.20250801 +typing-inspection==0.4.1 +typing_extensions==4.14.1 +tzdata==2025.2 +tzlocal==5.3.1 +uc-micro-py==1.0.3 +ucx-py-cu12==0.44.0 +ucxx-cu12==0.44.0 +umap-learn==0.5.9.post2 +umf==0.11.0 +uritemplate==4.2.0 +urllib3==2.5.0 +uvicorn==0.35.0 +vega-datasets==0.9.0 +wadllib==1.3.6 
+wandb==0.21.0 +wasabi==1.1.3 +wcwidth==0.2.13 +weasel==0.4.1 +webcolors==24.11.1 +webencodings==0.5.1 +websocket-client==1.8.0 +websockets==15.0.1 +Werkzeug==3.1.3 +widgetsnbextension==3.6.10 +wordcloud==1.9.4 +wrapt==1.17.2 +wurlitzer==3.1.1 +xarray==2025.7.1 +xarray-einstats==0.9.1 +xgboost==3.0.3 +xlrd==2.0.2 +xxhash==3.5.0 +xyzservices==2025.4.0 +yarl==1.20.1 +ydf==0.13.0 yellowbrick==1.5 -yfinance==0.2 +yfinance==0.2.65 +zict==3.0.0 +zipp==3.23.0 From f42800e1d65132476ab3df0d3844098868b76d4f Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Wed, 6 Aug 2025 19:47:22 +0000 Subject: [PATCH 18/18] skip polars on 3.10 --- noxfile.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/noxfile.py b/noxfile.py index 573a439abd..7adf499a08 100644 --- a/noxfile.py +++ b/noxfile.py @@ -80,7 +80,7 @@ UNIT_TEST_DEPENDENCIES: List[str] = [] UNIT_TEST_EXTRAS: List[str] = ["tests"] UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { - "3.10": ["tests", "polars", "scikit-learn", "anywidget"], + "3.10": ["tests", "scikit-learn", "anywidget"], "3.11": ["tests", "polars", "scikit-learn", "anywidget"], # Make sure we leave some versions without "extras" so we know those # dependencies are actually optional. @@ -114,9 +114,9 @@ SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { # Make sure we leave some versions without "extras" so we know those # dependencies are actually optional. - "3.10": ["scikit-learn", "polars", "anywidget"], - "3.11": ["scikit-learn", "polars", "anywidget"], - "3.13": ["polars", "anywidget"], + "3.10": ["tests", "scikit-learn", "anywidget"], + "3.11": ["tests", "scikit-learn", "polars", "anywidget"], + "3.13": ["tests", "polars", "anywidget"], } LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME"