feat: Add str accessor to index

TrevorBergeron · TrevorBergeron · commit f0556190e87f · 2025-10-17T19:51:36.000Z
diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py
@@ -43,6 +43,7 @@
 
 if typing.TYPE_CHECKING:
     import bigframes.dataframe
+    import bigframes.operations.strings
     import bigframes.series
 
 
@@ -254,6 +255,12 @@ def query_job(self) -> bigquery.QueryJob:
             self._query_job = query_job
         return self._query_job
 
+    @property
+    def str(self) -> bigframes.operations.strings.StringMethods:
+        import bigframes.operations.strings
+
+        return bigframes.operations.strings.StringMethods(self)
+
     def get_loc(self, key) -> typing.Union[int, slice, "bigframes.series.Series"]:
         """Get integer location, slice or boolean mask for requested label.
 
@@ -317,7 +324,9 @@ def get_loc(self, key) -> typing.Union[int, slice, "bigframes.series.Series"]:
             result_series = bigframes.series.Series(mask_block)
             return result_series.astype("boolean")
 
-    def _get_monotonic_slice(self, filtered_block, offsets_id: str) -> slice:
+    def _get_monotonic_slice(
+        self, filtered_block, offsets_id: __builtins__.str
+    ) -> slice:
         """Helper method to get a slice for monotonic duplicates with an optimized query."""
         # Combine min and max aggregations into a single query for efficiency
         min_max_aggs = [
@@ -343,7 +352,7 @@ def _get_monotonic_slice(self, filtered_block, offsets_id: str) -> slice:
         # Create slice (stop is exclusive)
         return slice(min_pos, max_pos + 1)
 
-    def __repr__(self) -> str:
+    def __repr__(self) -> __builtins__.str:
         # Protect against errors with uninitialized Series. See:
         # https://github.com/googleapis/python-bigquery-dataframes/issues/728
         if not hasattr(self, "_block"):
@@ -417,7 +426,7 @@ def sort_values(
         *,
         inplace: bool = False,
         ascending: bool = True,
-        na_position: str = "last",
+        na_position: __builtins__.str = "last",
     ) -> Index:
         if na_position not in ["first", "last"]:
             raise ValueError("Param na_position must be one of 'first' or 'last'")
@@ -604,7 +613,7 @@ def dropna(self, how: typing.Literal["all", "any"] = "any") -> Index:
         result = block_ops.dropna(self._block, self._block.index_columns, how=how)
         return Index(result)
 
-    def drop_duplicates(self, *, keep: str = "first") -> Index:
+    def drop_duplicates(self, *, keep: __builtins__.str = "first") -> Index:
         if keep is not False:
             validations.enforce_ordered(self, "drop_duplicates")
         block = block_ops.drop_duplicates(self._block, self._block.index_columns, keep)
@@ -656,6 +665,9 @@ def __contains__(self, key) -> bool:
         block, match_col = self._block.project_expr(match_expr_final)
         return cast(bool, block.get_stat(match_col, agg_ops.AnyOp()))
 
+    def _apply_unary_op(self, op: ops.UnaryOp) -> Index:
+        return self._apply_unary_expr(op.as_expr(ex.free_var("input")))
+
     def _apply_unary_expr(
         self,
         op: ex.Expression,
@@ -762,9 +774,15 @@ def item(self):
         return self.to_series().peek(2).item()
 
     def __eq__(self, other) -> Index:  # type: ignore
-        return self._apply_binop(other, ops.eq_op)
+        return self._apply_binary_op(other, ops.eq_op)
 
-    def _apply_binop(self, other, op: ops.BinaryOp) -> Index:
+    def _apply_binary_op(
+        self,
+        other,
+        op: ops.BinaryOp,
+        alignment: typing.Literal["outer", "left"] = "outer",
+    ) -> Index:
+        # Note: the alignment arg exists for compatibility with accessors; it is ignored because it is irrelevant for implicit joins.
         # TODO: Handle local objects, or objects not implicitly alignable? Gets ambiguous with partial ordering though
         if isinstance(other, (bigframes.series.Series, Index)):
             other = Index(other)
@@ -785,12 +803,13 @@ def _apply_binop(self, other, op: ops.BinaryOp) -> Index:
                     for lid, rid in zip(lexpr.column_ids, rexpr.column_ids)
                 ]
             )
+            labels = self.names if self.names == other.names else [None] * len(res_ids)
             return Index(
                 blocks.Block(
                     expr.select_columns(res_ids),
                     index_columns=res_ids,
                     column_labels=[],
-                    index_labels=[None] * len(res_ids),
+                    index_labels=labels,
                 )
             )
         elif (
@@ -799,7 +818,7 @@ def _apply_binop(self, other, op: ops.BinaryOp) -> Index:
             block, id = self._block.project_expr(
                 op.as_expr(self._block.index_columns[0], ex.const(other))
             )
-            return Index(block.select_column(id))
+            return Index(block.set_index([id], index_labels=self.names))
         elif isinstance(other, tuple) and len(other) == self.nlevels:
             block = self._block.project_exprs(
                 [
@@ -809,7 +828,7 @@ def _apply_binop(self, other, op: ops.BinaryOp) -> Index:
                 labels=[None] * self.nlevels,
                 drop=True,
             )
-            return Index(block.set_index(block.value_columns))
+            return Index(block.set_index(block.value_columns, index_labels=self.names))
         else:
             return NotImplemented
 
diff --git a/bigframes/core/indexes/multi.py b/bigframes/core/indexes/multi.py
@@ -60,7 +60,7 @@ def __eq__(self, other) -> Index:  # type: ignore
         import bigframes.operations as ops
         import bigframes.operations.aggregations as agg_ops
 
-        eq_result = self._apply_binop(other, ops.eq_op)._block.expr
+        eq_result = self._apply_binary_op(other, ops.eq_op)._block.expr
 
         as_array = ops.ToArrayOp().as_expr(
             *(
diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py
@@ -15,7 +15,7 @@
 from __future__ import annotations
 
 import re
-from typing import Literal, Optional, Union
+from typing import Literal, Optional, TYPE_CHECKING, Union
 
 import bigframes_vendored.constants as constants
 import bigframes_vendored.pandas.core.strings.accessor as vendorstr
@@ -25,7 +25,10 @@
 import bigframes.operations as ops
 from bigframes.operations._op_converters import convert_index, convert_slice
 import bigframes.operations.aggregations as agg_ops
-import bigframes.series as series
+
+if TYPE_CHECKING:
+    import bigframes.core.indexes.base as indices
+    import bigframes.series as series
 
 # Maps from python to re2
 REGEXP_FLAGS = {
@@ -39,7 +42,7 @@
 class StringMethods(vendorstr.StringMethods):
     __doc__ = vendorstr.StringMethods.__doc__
 
-    def __init__(self, data: series.Series):
+    def __init__(self, data: Union[series.Series, indices.Index]):
         self._data = data
 
     def __getitem__(self, key: Union[int, slice]) -> series.Series:
diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py
@@ -685,3 +685,23 @@ def test_index_eq_aligned_index(scalars_df_index, scalars_pandas_df_index):
         scalars_pandas_df_index.int64_col.abs()
     )
     assert bf_result == pd.Index(pd_result)
+
+
+def test_index_str_accessor_unary(scalars_df_index, scalars_pandas_df_index):
+    bf_index = scalars_df_index.set_index("string_col").index
+    pd_index = scalars_pandas_df_index.set_index("string_col").index
+
+    bf_result = bf_index.str.pad(30, side="both", fillchar="~").to_pandas()
+    pd_result = pd_index.str.pad(30, side="both", fillchar="~")
+
+    pd.testing.assert_index_equal(bf_result, pd_result)
+
+
+def test_index_str_accessor_binary(scalars_df_index, scalars_pandas_df_index):
+    bf_index = scalars_df_index.set_index("string_col").index
+    pd_index = scalars_pandas_df_index.set_index("string_col").index
+
+    bf_result = bf_index.str.cat(bf_index.str[:4]).to_pandas()
+    pd_result = pd_index.str.cat(pd_index.str[:4])
+
+    pd.testing.assert_index_equal(bf_result, pd_result)