Skip to content

Commit 0bd920b

Browse files
feat: implement bigframes.bigquery.search function
Implements the `bigframes.bigquery.search` function, which maps to the BigQuery `SEARCH` function. This includes: - Defining `SearchOp` in `bigframes/operations/search_ops.py`. - Implementing the user-facing `search` function in `bigframes/bigquery/_operations/search.py`. - Registering the operation in the Ibis compiler with custom SQL generation to handle BigQuery's named argument syntax (`=>`). - Exposing the function in `bigframes/bigquery/__init__.py`. - Adding unit tests in `tests/unit/bigquery/test_search.py` to verify the expression tree construction.
1 parent 173b83d commit 0bd920b

File tree

6 files changed

+334
-2
lines changed

6 files changed

+334
-2
lines changed

bigframes/bigquery/__init__.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,11 @@
5757
to_json,
5858
to_json_string,
5959
)
60-
from bigframes.bigquery._operations.search import create_vector_index, vector_search
60+
from bigframes.bigquery._operations.search import (
61+
create_vector_index,
62+
search,
63+
vector_search,
64+
)
6165
from bigframes.bigquery._operations.sql import sql_scalar
6266
from bigframes.bigquery._operations.struct import struct
6367
from bigframes.core import log_adapter
@@ -99,6 +103,7 @@
99103
to_json_string,
100104
# search ops
101105
create_vector_index,
106+
search,
102107
vector_search,
103108
# sql ops
104109
sql_scalar,
@@ -150,6 +155,7 @@
150155
"to_json_string",
151156
# search ops
152157
"create_vector_index",
158+
"search",
153159
"vector_search",
154160
# sql ops
155161
"sql_scalar",

bigframes/bigquery/_operations/search.py

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def create_vector_index(
9191
def vector_search(
9292
base_table: str,
9393
column_to_search: str,
94-
query: Union[dataframe.DataFrame, series.Series],
94+
query: Union["dataframe.DataFrame", "series.Series"],
9595
*,
9696
query_column_to_search: Optional[str] = None,
9797
top_k: Optional[int] = None,
@@ -247,3 +247,87 @@ def vector_search(
247247
df = query._session.read_gbq_query(sql, allow_large_results=allow_large_results)
248248

249249
return df
250+
251+
252+
def search(
    data_to_search: Union["dataframe.DataFrame", "series.Series"],
    search_query: str,
    *,
    json_scope: Optional[str] = None,
    analyzer: Optional[str] = None,
    analyzer_options: Optional[str] = None,
) -> series.Series:
    """
    The SEARCH function checks to see whether a BigQuery table or other search
    data contains a set of search terms (tokens). It returns TRUE if all search
    terms appear in the data, based on the rules for search_query and text
    analysis described in the text analyzer. Otherwise, this function returns
    FALSE.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq

        >>> data = bpd.read_gbq("SELECT 'Please use foobar@example.com as your email.' AS email")
        >>> bbq.search(data['email'], 'exam')
        0    False
        Name: email, dtype: boolean

        >>> bbq.search(data['email'], 'foobar')
        0    True
        Name: email, dtype: boolean

        >>> bbq.search(data['email'], 'example.com')
        0    True
        Name: email, dtype: boolean

    Args:
        data_to_search (bigframes.dataframe.DataFrame | bigframes.series.Series):
            The data to search over. A DataFrame is packed into a single
            STRUCT-valued Series (one field per column) before searching.
        search_query (str):
            A STRING literal, or a STRING constant expression that represents
            the terms of the search query.
        json_scope (str, optional):
            A named argument with a STRING value. Takes one of the following
            values to indicate the scope of JSON data to be searched:
            'JSON_VALUES', 'JSON_KEYS', 'JSON_KEYS_AND_VALUES'. It has no
            effect if data_to_search isn't a JSON value or doesn't contain a
            JSON field. When omitted, the argument is not sent and BigQuery
            applies its own default.
        analyzer (str, optional):
            A named argument with a STRING value. Takes one of the following
            values to indicate the text analyzer to use: 'LOG_ANALYZER',
            'NO_OP_ANALYZER', 'PATTERN_ANALYZER'. When omitted, the argument
            is not sent and BigQuery applies its own default.
        analyzer_options (str, optional):
            A named argument with a JSON-formatted STRING value. Takes a list
            of text analysis rules.

    Returns:
        bigframes.series.Series: A new Series with the boolean result.

    Raises:
        ValueError: If ``data_to_search`` is neither a Series nor a DataFrame.
    """
    # Import every module referenced through the ``bigframes`` package name
    # explicitly, rather than relying on ``bigframes.series`` pulling in
    # ``bigframes.dataframe`` as a side effect.
    import bigframes.dataframe
    import bigframes.operations.search_ops as search_ops
    import bigframes.series

    if not isinstance(
        data_to_search, (bigframes.series.Series, bigframes.dataframe.DataFrame)
    ):
        raise ValueError("data_to_search must be a Series or DataFrame")

    if isinstance(data_to_search, bigframes.dataframe.DataFrame):
        # The op is unary and applied to a Series, so a DataFrame is first
        # collapsed into one STRUCT column built from all of its columns.
        import bigframes.bigquery._operations.struct as struct_ops

        data_to_search = struct_ops.struct(data_to_search)

    return data_to_search._apply_unary_op(
        search_ops.SearchOp(
            search_query=search_query,
            json_scope=json_scope,
            analyzer=analyzer,
            analyzer_options=analyzer_options,
        )
    )

bigframes/core/compile/ibis_compiler/scalar_op_registry.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2153,6 +2153,43 @@ def str_lstrip_op( # type: ignore[empty-body]
21532153
"""Remove leading and trailing characters."""
21542154

21552155

2156+
@scalar_op_compiler.register_unary_op(ops.SearchOp, pass_op=True)
def search_op_impl(x: ibis_types.Value, op: ops.SearchOp):
    """Compile a ``SearchOp`` into a boolean ``SEARCH(...)`` SQL scalar.

    The searched value and the search query are always emitted as the two
    positional arguments. Each optional setting on ``op`` that is not ``None``
    is appended as a ``name=>value`` named argument, in the fixed order
    ``json_scope``, ``analyzer``, ``analyzer_options``.

    Args:
        x: The ibis value being searched.
        op: The operation holding the query and the optional settings.

    Returns:
        The expression built from an ``ibis_generic.SqlScalar`` whose output
        type is boolean.
    """

    def _string_operand(text: str) -> ibis_generic.Value:
        # Wrap a Python string as a literal operand of the SQL template.
        return typing.cast(ibis_generic.Value, ibis_types.literal(text).op())

    operands = [
        typing.cast(ibis_generic.Value, x.op()),
        _string_operand(op.search_query),
    ]
    fragments = ["SEARCH({0}, {1}"]

    named_arguments = (
        ("json_scope", op.json_scope),
        ("analyzer", op.analyzer),
        ("analyzer_options", op.analyzer_options),
    )
    for keyword, setting in named_arguments:
        if setting is None:
            continue
        # The placeholder index is the position the operand is about to take.
        fragments.append(f", {keyword}=>{{{len(operands)}}}")
        operands.append(_string_operand(setting))

    fragments.append(")")

    return ibis_generic.SqlScalar(
        ibis_generic.Literal("".join(fragments), dtype=ibis_dtypes.string),
        values=tuple(operands),
        output_type=ibis_dtypes.boolean,
    ).to_expr()
2191+
2192+
21562193
@ibis_udf.scalar.builtin(name="rtrim")
21572194
def str_rstrip_op( # type: ignore[empty-body]
21582195
x: ibis_dtypes.String, to_strip: ibis_dtypes.String

bigframes/operations/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,7 @@
184184
NaryRemoteFunctionOp,
185185
RemoteFunctionOp,
186186
)
187+
from bigframes.operations.search_ops import SearchOp
187188
from bigframes.operations.string_ops import (
188189
capitalize_op,
189190
EndsWithOp,
@@ -374,6 +375,8 @@
374375
"BinaryRemoteFunctionOp",
375376
"NaryRemoteFunctionOp",
376377
"RemoteFunctionOp",
378+
# Search ops
379+
"SearchOp",
377380
# Frequency ops
378381
"DatetimeToIntegerLabelOp",
379382
"FloorDtOp",

bigframes/operations/search_ops.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import dataclasses
16+
import typing
17+
18+
from bigframes import dtypes
19+
from bigframes.operations import base_ops
20+
21+
22+
@dataclasses.dataclass(frozen=True)
class SearchOp(base_ops.UnaryOp):
    """Unary operation holding the arguments of a ``SEARCH`` call.

    The single operand is the data being searched; the fields below carry the
    query string and the optional settings. A field left as ``None`` means the
    caller did not supply that setting.
    """

    name: typing.ClassVar[str] = "search"
    # Terms to look for.
    search_query: str
    # Optional settings; all are plain strings when provided.
    json_scope: typing.Optional[str] = None
    analyzer: typing.Optional[str] = None
    analyzer_options: typing.Optional[str] = None

    def output_type(self, *input_types):
        """Return the boolean dtype, whatever the input types are."""
        return dtypes.BOOL_DTYPE

tests/unit/bigquery/test_search.py

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pandas as pd
16+
import pytest
17+
18+
import bigframes.bigquery as bbq
19+
import bigframes.operations.search_ops as search_ops
20+
import bigframes.series
21+
import bigframes.session
22+
import bigframes.testing.mocks
23+
24+
25+
@pytest.fixture
def mock_session():
    """Provide a session built by ``bigframes.testing.mocks``."""
    session = bigframes.testing.mocks.create_bigquery_session()
    return session
28+
29+
30+
def test_search_series(mock_session):
    """search() on a Series adds exactly one SearchOp with default options."""
    import bigframes.core.expression as ex
    import bigframes.core.nodes as nodes

    # A real Series backed by the mock session.
    series_under_test = bigframes.series.Series(
        ["foo bar", "baz"], session=mock_session
    )
    search_query = "foo"
    result = bbq.search(series_under_test, search_query)

    # Skip any SelectionNodes wrapping the projection that holds the op.
    node = result._block.expr.node
    while isinstance(node, nodes.SelectionNode):
        node = node.child
    assert isinstance(node, nodes.ProjectionNode)

    # Exactly one assignment of the projection should be a SearchOp.
    search_expressions = [
        expression
        for expression, _ in node.assignments
        if isinstance(expression, ex.OpExpression)
        and isinstance(expression.op, search_ops.SearchOp)
    ]
    assert len(search_expressions) == 1
    search_op = search_expressions[0].op

    assert search_op.search_query == search_query
    assert search_op.json_scope is None
    assert search_op.analyzer is None
    assert search_op.analyzer_options is None
68+
69+
70+
def test_search_series_with_options(mock_session):
    """Every optional keyword passed to search() is stored on the SearchOp."""
    import bigframes.core.expression as ex
    import bigframes.core.nodes as nodes

    series_under_test = bigframes.series.Series(
        ["foo bar", "baz"], session=mock_session
    )
    search_query = "foo"
    result = bbq.search(
        series_under_test,
        search_query,
        json_scope="JSON_VALUES",
        analyzer="LOG_ANALYZER",
        analyzer_options='{"delimiters": [" "]}',
    )

    # Skip any SelectionNodes wrapping the projection that holds the op.
    node = result._block.expr.node
    while isinstance(node, nodes.SelectionNode):
        node = node.child
    assert isinstance(node, nodes.ProjectionNode)

    # Exactly one assignment of the projection should be a SearchOp.
    search_expressions = [
        expression
        for expression, _ in node.assignments
        if isinstance(expression, ex.OpExpression)
        and isinstance(expression.op, search_ops.SearchOp)
    ]
    assert len(search_expressions) == 1
    search_op = search_expressions[0].op

    assert search_op.search_query == search_query
    assert search_op.json_scope == "JSON_VALUES"
    assert search_op.analyzer == "LOG_ANALYZER"
    assert search_op.analyzer_options == '{"delimiters": [" "]}'
107+
108+
109+
def test_search_dataframe(mock_session):
    """search() on a DataFrame feeds a struct of all columns into SearchOp."""
    # Imported explicitly: the module-level imports of this test file do not
    # include bigframes.dataframe, so it would otherwise only be reachable as
    # a side effect of other imports.
    import bigframes.dataframe
    import bigframes.core.expression as ex
    import bigframes.core.nodes as nodes
    from bigframes.operations import struct_ops

    # Two-column frame backed by the mock session.
    df = pd.DataFrame({"col1": ["foo", "bar"], "col2": ["baz", "qux"]})
    bf = bigframes.dataframe.DataFrame(df, session=mock_session)

    search_query = "foo"
    result = bbq.search(bf, search_query)

    # Skip any SelectionNodes wrapping the projection that holds the op.
    node = result._block.expr.node
    while isinstance(node, nodes.SelectionNode):
        node = node.child
    assert isinstance(node, nodes.ProjectionNode)

    # Exactly one assignment of the projection should be a SearchOp.
    search_expressions = [
        expression
        for expression, _ in node.assignments
        if isinstance(expression, ex.OpExpression)
        and isinstance(expression.op, search_ops.SearchOp)
    ]
    assert len(search_expressions) == 1
    search_expression = search_expressions[0]
    assert search_expression.op.search_query == search_query

    # The op's input is expected to be a reference to a column produced by an
    # earlier projection (the struct() call), not an inline expression.
    search_input = search_expression.inputs[0]
    assert isinstance(search_input, ex.DerefOp)

    child_node = node.child
    while isinstance(child_node, nodes.SelectionNode):
        child_node = child_node.child
    assert isinstance(child_node, nodes.ProjectionNode)

    # Look up the assignment that defines the referenced struct column.
    struct_col_id = search_input.id
    struct_assignment = next(
        expression
        for expression, column_id in child_node.assignments
        if column_id == struct_col_id
    )

    assert isinstance(struct_assignment, ex.OpExpression)
    assert isinstance(struct_assignment.op, struct_ops.StructOp)
    assert struct_assignment.op.column_names == ("col1", "col2")
167+
168+
169+
def test_search_invalid_input(mock_session):
    """A plain string is neither a Series nor a DataFrame and is rejected."""
    expected_message = "data_to_search must be a Series or DataFrame"
    with pytest.raises(ValueError, match=expected_message):
        bbq.search("invalid", "foo")

0 commit comments

Comments
 (0)