
Commit c9151dc

Merge branch 'main' into sycai_ai_gen_bool

2 parents: 7cf3e41 + f454779

File tree

12 files changed: +188, -78 lines

bigframes/core/compile/ibis_compiler/scalar_op_registry.py

Lines changed: 1 addition & 3 deletions
@@ -1025,8 +1025,6 @@ def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp):
                 x, ibis_dtypes.string, safe=op.safe
             )
             return parse_json_in_safe(x_str) if op.safe else parse_json(x_str)
-        if x.type().is_struct():
-            return to_json_string(typing.cast(ibis_types.StructValue, x))

     if x.type() == ibis_dtypes.json:
         if to_type == ibis_dtypes.int64:
@@ -2097,7 +2095,7 @@ def json_extract_string_array( # type: ignore[empty-body]

 @ibis_udf.scalar.builtin(name="to_json_string")
 def to_json_string( # type: ignore[empty-body]
-    json_obj,
+    json_obj: ibis_dtypes.JSON,
 ) -> ibis_dtypes.String:
     """Convert JSON to STRING."""
bigframes/core/nodes.py

Lines changed: 9 additions & 1 deletion
@@ -300,7 +300,15 @@ def remap_vars(
     def remap_refs(
         self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId]
     ) -> InNode:
-        return dataclasses.replace(self, left_col=self.left_col.remap_column_refs(mappings, allow_partial_bindings=True), right_col=self.right_col.remap_column_refs(mappings, allow_partial_bindings=True))  # type: ignore
+        return dataclasses.replace(
+            self,
+            left_col=self.left_col.remap_column_refs(
+                mappings, allow_partial_bindings=True
+            ),
+            right_col=self.right_col.remap_column_refs(
+                mappings, allow_partial_bindings=True
+            ),
+        )  # type: ignore


 @dataclasses.dataclass(frozen=True, eq=False)

bigframes/core/rewrite/identifiers.py

Lines changed: 63 additions & 26 deletions
@@ -13,6 +13,7 @@
 # limitations under the License.
 from __future__ import annotations

+import dataclasses
 import typing

 from bigframes.core import identifiers, nodes
@@ -26,32 +27,68 @@ def remap_variables(
     nodes.BigFrameNode,
     dict[identifiers.ColumnId, identifiers.ColumnId],
 ]:
-    """Remaps `ColumnId`s in the BFET to produce deterministic and sequential UIDs.
+    """Remaps `ColumnId`s in the expression tree to be deterministic and sequential.

-    Note: this will convert a DAG to a tree.
+    This function performs a post-order traversal. It recursively remaps children
+    nodes first, then remaps the current node's references and definitions.
+
+    Note: this will convert a DAG to a tree by duplicating shared nodes.
+
+    Args:
+        root: The root node of the expression tree.
+        id_generator: An iterator that yields new column IDs.
+
+    Returns:
+        A tuple of the new root node and a mapping from old to new column IDs
+        visible to the parent node.
     """
-    child_replacement_map = dict()
-    ref_mapping = dict()
-    # Sequential ids are assigned bottom-up left-to-right
+    # Step 1: Recursively remap children to get their new nodes and ID mappings.
+    new_child_nodes: list[nodes.BigFrameNode] = []
+    new_child_mappings: list[dict[identifiers.ColumnId, identifiers.ColumnId]] = []
     for child in root.child_nodes:
-        new_child, child_var_mapping = remap_variables(child, id_generator=id_generator)
-        child_replacement_map[child] = new_child
-        ref_mapping.update(child_var_mapping)
-
-    # This is actually invalid until we've replaced all of children, refs and var defs
-    with_new_children = root.transform_children(
-        lambda node: child_replacement_map[node]
-    )
-
-    with_new_refs = with_new_children.remap_refs(ref_mapping)
-
-    node_var_mapping = {old_id: next(id_generator) for old_id in root.node_defined_ids}
-    with_new_vars = with_new_refs.remap_vars(node_var_mapping)
-    with_new_vars._validate()
-
-    return (
-        with_new_vars,
-        node_var_mapping
-        if root.defines_namespace
-        else (ref_mapping | node_var_mapping),
-    )
+        new_child, child_mappings = remap_variables(child, id_generator=id_generator)
+        new_child_nodes.append(new_child)
+        new_child_mappings.append(child_mappings)
+
+    # Step 2: Transform children to use their new nodes.
+    remapped_children: dict[nodes.BigFrameNode, nodes.BigFrameNode] = {
+        child: new_child for child, new_child in zip(root.child_nodes, new_child_nodes)
+    }
+    new_root = root.transform_children(lambda node: remapped_children[node])
+
+    # Step 3: Transform the current node using the mappings from its children.
+    downstream_mappings: dict[identifiers.ColumnId, identifiers.ColumnId] = {
+        k: v for mapping in new_child_mappings for k, v in mapping.items()
+    }
+    if isinstance(new_root, nodes.InNode):
+        new_root = typing.cast(nodes.InNode, new_root)
+        new_root = dataclasses.replace(
+            new_root,
+            left_col=new_root.left_col.remap_column_refs(
+                new_child_mappings[0], allow_partial_bindings=True
+            ),
+            right_col=new_root.right_col.remap_column_refs(
+                new_child_mappings[1], allow_partial_bindings=True
+            ),
+        )
+    else:
+        new_root = new_root.remap_refs(downstream_mappings)
+
+    # Step 4: Create new IDs for columns defined by the current node.
+    node_defined_mappings = {
+        old_id: next(id_generator) for old_id in root.node_defined_ids
+    }
+    new_root = new_root.remap_vars(node_defined_mappings)
+
+    new_root._validate()
+
+    # Step 5: Determine which mappings to propagate up to the parent.
+    if root.defines_namespace:
+        # If a node defines a new namespace (e.g., a join), mappings from its
+        # children are not visible to its parents.
+        mappings_for_parent = node_defined_mappings
+    else:
+        # Otherwise, pass up the combined mappings from children and the current node.
+        mappings_for_parent = downstream_mappings | node_defined_mappings
+
+    return new_root, mappings_for_parent
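The rewritten `remap_variables` is a post-order rewrite: children are remapped first, their ID mappings are merged, and only then are the current node's references and freshly defined columns renamed. A self-contained sketch of that pattern on a toy node type (all names below are illustrative stand-ins, not the real BigFrameNode API):

    import itertools
    from dataclasses import dataclass, replace


    @dataclass(frozen=True)
    class Node:
        children: tuple["Node", ...] = ()   # child nodes
        refs: tuple[str, ...] = ()          # column IDs this node reads
        defines: tuple[str, ...] = ()       # column IDs this node creates


    def remap(node: Node, ids) -> tuple[Node, dict[str, str]]:
        # Steps 1-2: remap children first (post-order) and collect their mappings.
        new_children, mapping = [], {}
        for child in node.children:
            new_child, child_map = remap(child, ids)
            new_children.append(new_child)
            mapping.update(child_map)
        # Step 3: rewrite this node's references using the children's mappings.
        new_refs = tuple(mapping.get(r, r) for r in node.refs)
        # Step 4: mint fresh, sequential IDs for the columns defined here.
        defined_map = {d: next(ids) for d in node.defines}
        new_node = replace(
            node,
            children=tuple(new_children),
            refs=new_refs,
            defines=tuple(defined_map[d] for d in node.defines),
        )
        # Step 5: propagate the combined mapping up to the parent.
        return new_node, {**mapping, **defined_map}


    ids = (f"id_{i}" for i in itertools.count())
    leaf = Node(defines=("a", "b"))
    root = Node(children=(leaf,), refs=("a",), defines=("c",))
    print(remap(root, ids))  # a -> id_0, b -> id_1, root's ref follows, c -> id_2

The real implementation additionally special-cases InNode, whose left and right references are remapped against the first and second child's mappings respectively, and drops child mappings when a node defines its own namespace.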

bigframes/dtypes.py

Lines changed: 3 additions & 6 deletions
@@ -641,9 +641,6 @@ def _dtype_from_string(dtype_string: str) -> typing.Optional[Dtype]:
         return BIGFRAMES_STRING_TO_BIGFRAMES[
             typing.cast(DtypeString, str(dtype_string))
         ]
-    if isinstance(dtype_string, str) and dtype_string.lower() == "json":
-        return JSON_DTYPE
-
     raise TypeError(
         textwrap.dedent(
             f"""
@@ -655,9 +652,9 @@ def _dtype_from_string(dtype_string: str) -> typing.Optional[Dtype]:
             The following pandas.ExtensionDtype are supported:
                 pandas.BooleanDtype(), pandas.Float64Dtype(),
                 pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"),
-                pandas.ArrowDtype(pa.date32()), pandas.ArrowDtype(pa.time64("us")),
-                pandas.ArrowDtype(pa.timestamp("us")),
-                pandas.ArrowDtype(pa.timestamp("us", tz="UTC")).
+                pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")),
+                pd.ArrowDtype(pa.timestamp("us")),
+                pd.ArrowDtype(pa.timestamp("us", tz="UTC")).
             {constants.FEEDBACK_LINK}
             """
         )

bigframes/operations/generic_ops.py

Lines changed: 0 additions & 2 deletions
@@ -324,8 +324,6 @@ def _valid_cast(src: dtypes.Dtype, dst: dtypes.Dtype):
             if not _valid_cast(src_dtype, dst_dtype):
                 return False
         return True
-    if dtypes.is_struct_like(src) and dst == dtypes.JSON_DTYPE:
-        return True

     return _valid_scalar_cast(src, dst)

bigframes/pandas/__init__.py

Lines changed: 3 additions & 3 deletions
@@ -17,7 +17,7 @@
 from __future__ import annotations

 from collections import namedtuple
-from datetime import datetime
+from datetime import date, datetime
 import inspect
 import sys
 import typing
@@ -198,7 +198,7 @@ def to_datetime(

 @typing.overload
 def to_datetime(
-    arg: Union[int, float, str, datetime],
+    arg: Union[int, float, str, datetime, date],
     *,
     utc: bool = False,
     format: Optional[str] = None,
@@ -209,7 +209,7 @@ def to_datetime(

 def to_datetime(
     arg: Union[
-        Union[int, float, str, datetime],
+        Union[int, float, str, datetime, date],
         vendored_pandas_datetimes.local_iterables,
         bigframes.series.Series,
         bigframes.dataframe.DataFrame,
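With `date` added to these overloads, `to_datetime` accepts a plain `datetime.date` scalar as well as a `datetime`. A hedged usage sketch (assuming an initialized bigframes session where one is required; scalar inputs are expected to behave as in pandas):

    from datetime import date

    import bigframes.pandas as bpd

    # A plain date is now a valid scalar argument, alongside int/float/str/datetime.
    ts = bpd.to_datetime(date(2025, 1, 15))
    print(ts)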

bigframes/streaming/dataframe.py

Lines changed: 43 additions & 8 deletions
@@ -15,13 +15,16 @@
 """Module for bigquery continuous queries"""
 from __future__ import annotations

+from abc import abstractmethod
+from datetime import date, datetime
 import functools
 import inspect
 import json
-from typing import Optional
+from typing import Optional, Union
 import warnings

 from google.cloud import bigquery
+import pandas as pd

 from bigframes import dataframe
 from bigframes.core import log_adapter, nodes
@@ -54,9 +57,14 @@ def _curate_df_doc(doc: Optional[str]):


 class StreamingBase:
-    _appends_sql: str
     _session: bigframes.session.Session

+    @abstractmethod
+    def _appends_sql(
+        self, start_timestamp: Optional[Union[int, float, str, datetime, date]]
+    ) -> str:
+        pass
+
     def to_bigtable(
         self,
         *,
@@ -70,6 +78,8 @@ def to_bigtable(
         bigtable_options: Optional[dict] = None,
         job_id: Optional[str] = None,
         job_id_prefix: Optional[str] = None,
+        start_timestamp: Optional[Union[int, float, str, datetime, date]] = None,
+        end_timestamp: Optional[Union[int, float, str, datetime, date]] = None,
     ) -> bigquery.QueryJob:
         """
         Export the StreamingDataFrame as a continue job and returns a
@@ -115,16 +125,24 @@ def to_bigtable(
                 If specified, a job id prefix for the query, see
                 job_id_prefix parameter of
                 https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query
-
+            start_timestamp (int, float, str, datetime, date, default None):
+                The starting timestamp for the query; it can be at most 7 days in the past. If no timestamp is specified (None), the query defaults to the earliest possible time, 7 days ago. A time-zone-naive timestamp is treated as UTC.
         Returns:
             google.cloud.bigquery.QueryJob:
                 See https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob
                 The ongoing query job can be managed using this object.
                 For example, the job can be cancelled or its error status
                 can be examined.
         """
+        if not isinstance(
+            start_timestamp, (int, float, str, datetime, date, type(None))
+        ):
+            raise ValueError(
+                f"Unsupported start_timestamp type {type(start_timestamp)}"
+            )
+
         return _to_bigtable(
-            self._appends_sql,
+            self._appends_sql(start_timestamp),
             instance=instance,
             table=table,
             service_account_email=service_account_email,
@@ -145,6 +163,7 @@ def to_pubsub(
         service_account_email: str,
         job_id: Optional[str] = None,
         job_id_prefix: Optional[str] = None,
+        start_timestamp: Optional[Union[int, float, str, datetime, date]] = None,
     ) -> bigquery.QueryJob:
         """
         Export the StreamingDataFrame as a continue job and returns a
@@ -172,6 +191,8 @@
                 If specified, a job id prefix for the query, see
                 job_id_prefix parameter of
                 https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query
+            start_timestamp (int, float, str, datetime, date, default None):
+                The starting timestamp for the query; it can be at most 7 days in the past. If no timestamp is specified (None), the query defaults to the earliest possible time, 7 days ago. A time-zone-naive timestamp is treated as UTC.

         Returns:
             google.cloud.bigquery.QueryJob:
@@ -180,8 +201,15 @@
                 For example, the job can be cancelled or its error status
                 can be examined.
         """
+        if not isinstance(
+            start_timestamp, (int, float, str, datetime, date, type(None))
+        ):
+            raise ValueError(
+                f"Unsupported start_timestamp type {type(start_timestamp)}"
+            )
+
         return _to_pubsub(
-            self._appends_sql,
+            self._appends_sql(start_timestamp),
             topic=topic,
             service_account_email=service_account_email,
             session=self._session,
@@ -280,14 +308,21 @@ def sql(self):
     sql.__doc__ = _curate_df_doc(inspect.getdoc(dataframe.DataFrame.sql))

     # Patch for the required APPENDS clause
-    @property
-    def _appends_sql(self):
+    def _appends_sql(
+        self, start_timestamp: Optional[Union[int, float, str, datetime, date]]
+    ) -> str:
         sql_str = self.sql
         original_table = self._original_table
         assert original_table is not None

         # TODO(b/405691193): set start time back to NULL. Now set it slightly after 7 days max interval to avoid the bug.
-        appends_clause = f"APPENDS(TABLE `{original_table}`, CURRENT_TIMESTAMP() - (INTERVAL 7 DAY - INTERVAL 5 MINUTE))"
+        start_ts_str = (
+            str(f"TIMESTAMP('{pd.to_datetime(start_timestamp)}')")
+            if start_timestamp
+            else "CURRENT_TIMESTAMP() - (INTERVAL 7 DAY - INTERVAL 5 MINUTE)"
+        )
+
+        appends_clause = f"APPENDS(TABLE `{original_table}`, {start_ts_str})"
         sql_str = sql_str.replace(f"`{original_table}`", appends_clause)
         return sql_str

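`_appends_sql` now splices the caller-supplied `start_timestamp` into the APPENDS change-history clause instead of always anchoring at roughly seven days ago. A standalone sketch of the string construction, mirroring the logic above (the table name and timestamp are illustrative):

    from datetime import datetime, timedelta

    import pandas as pd

    original_table = "my-project.my_dataset.my_table"  # hypothetical table name
    start_timestamp = datetime.now() - timedelta(days=1)

    start_ts_str = (
        f"TIMESTAMP('{pd.to_datetime(start_timestamp)}')"
        if start_timestamp
        else "CURRENT_TIMESTAMP() - (INTERVAL 7 DAY - INTERVAL 5 MINUTE)"
    )
    appends_clause = f"APPENDS(TABLE `{original_table}`, {start_ts_str})"
    print(appends_clause)
    # APPENDS(TABLE `my-project.my_dataset.my_table`, TIMESTAMP('2025-...'))

The system tests below exercise this path by passing `start_timestamp=datetime.now() - timedelta(days=1)` to `to_bigtable` and `to_pubsub`.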
tests/system/large/streaming/test_bigtable.py

Lines changed: 4 additions & 2 deletions
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from datetime import datetime, timedelta
 import time
 from typing import Generator
 import uuid
@@ -91,11 +92,12 @@ def test_streaming_df_to_bigtable(
         bigtable_options={},
         job_id=None,
         job_id_prefix=job_id_prefix,
+        start_timestamp=datetime.now() - timedelta(days=1),
     )

-    # wait 100 seconds in order to ensure the query doesn't stop
+    # wait 200 seconds in order to ensure the query doesn't stop
     # (i.e. it is continuous)
-    time.sleep(100)
+    time.sleep(200)
     assert query_job.running()
     assert query_job.error_result is None
     assert str(query_job.job_id).startswith(job_id_prefix)

tests/system/large/streaming/test_pubsub.py

Lines changed: 4 additions & 2 deletions
@@ -13,6 +13,7 @@
 # limitations under the License.

 from concurrent import futures
+from datetime import datetime, timedelta
 from typing import Generator
 import uuid

@@ -99,11 +100,12 @@ def callback(message):
         service_account_email="streaming-testing@bigframes-load-testing.iam.gserviceaccount.com",
         job_id=None,
         job_id_prefix=job_id_prefix,
+        start_timestamp=datetime.now() - timedelta(days=1),
     )
     try:
-        # wait 100 seconds in order to ensure the query doesn't stop
+        # wait 200 seconds in order to ensure the query doesn't stop
         # (i.e. it is continuous)
-        future.result(timeout=100)
+        future.result(timeout=200)
     except futures.TimeoutError:
         future.cancel()
     assert query_job.running()

0 commit comments