Skip to content

Commit 683f349

Browse files
Merge remote-tracking branch 'github/main' into polars_semi
2 parents 404f447 + 37666e4 commit 683f349

File tree

16 files changed

+274
-173
lines changed

16 files changed

+274
-173
lines changed

bigframes/operations/blob.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,7 @@ def get_runtime_json_str(
303303
def exif(
304304
self,
305305
*,
306+
engine: Literal[None, "pillow"] = None,
306307
connection: Optional[str] = None,
307308
max_batching_rows: int = 8192,
308309
container_cpu: Union[float, int] = 0.33,
@@ -311,6 +312,7 @@ def exif(
311312
"""Extract EXIF data. Currently only image types are supported.
312313
313314
Args:
315+
engine ('pillow' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
314316
connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session.
315317
max_batching_rows (int, default 8,192): Max number of rows per batch sent to Cloud Run to execute the function.
316318
container_cpu (int or float, default 0.33): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to integers.
@@ -319,6 +321,8 @@ def exif(
319321
Returns:
320322
bigframes.series.Series: JSON series of key-value pairs.
321323
"""
324+
if engine is None or engine.casefold() != "pillow":
325+
raise ValueError("Must specify the engine, supported value is 'pillow'.")
322326

323327
import bigframes.bigquery as bbq
324328
import bigframes.blob._functions as blob_func
@@ -344,6 +348,7 @@ def image_blur(
344348
self,
345349
ksize: tuple[int, int],
346350
*,
351+
engine: Literal[None, "opencv"] = None,
347352
dst: Optional[Union[str, bigframes.series.Series]] = None,
348353
connection: Optional[str] = None,
349354
max_batching_rows: int = 8192,
@@ -354,6 +359,7 @@ def image_blur(
354359
355360
Args:
356361
ksize (tuple(int, int)): Kernel size.
362+
engine ('opencv' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
357363
dst (str or bigframes.series.Series or None, default None): Output destination. Can be one of:
358364
str: GCS folder str. The output filenames are the same as the input files.
359365
blob Series: The output file paths are determined by the uris of the blob Series.
@@ -367,6 +373,9 @@ def image_blur(
367373
Returns:
368374
bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ.
369375
"""
376+
if engine is None or engine.casefold() != "opencv":
377+
raise ValueError("Must specify the engine, supported value is 'opencv'.")
378+
370379
import bigframes.blob._functions as blob_func
371380

372381
connection = self._resolve_connection(connection)
@@ -424,6 +433,7 @@ def image_resize(
424433
self,
425434
dsize: tuple[int, int] = (0, 0),
426435
*,
436+
engine: Literal[None, "opencv"] = None,
427437
fx: float = 0.0,
428438
fy: float = 0.0,
429439
dst: Optional[Union[str, bigframes.series.Series]] = None,
@@ -436,6 +446,7 @@ def image_resize(
436446
437447
Args:
438448
dsize (tuple(int, int), default (0, 0)): Destination size. If set to 0, fx and fy parameters determine the size.
449+
engine ('opencv' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
439450
fx (float, default 0.0): scale factor along the horizontal axis. If set to 0.0, dsize parameter determines the output size.
440451
fy (float, default 0.0): scale factor along the vertical axis. If set to 0.0, dsize parameter determines the output size.
441452
dst (str or bigframes.series.Series or None, default None): Output destination. Can be one of:
@@ -451,6 +462,9 @@ def image_resize(
451462
Returns:
452463
bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ.
453464
"""
465+
if engine is None or engine.casefold() != "opencv":
466+
raise ValueError("Must specify the engine, supported value is 'opencv'.")
467+
454468
dsize_set = dsize[0] > 0 and dsize[1] > 0
455469
fsize_set = fx > 0.0 and fy > 0.0
456470
if not dsize_set ^ fsize_set:
@@ -516,6 +530,7 @@ def image_resize(
516530
def image_normalize(
517531
self,
518532
*,
533+
engine: Literal[None, "opencv"] = None,
519534
alpha: float = 1.0,
520535
beta: float = 0.0,
521536
norm_type: str = "l2",
@@ -528,6 +543,7 @@ def image_normalize(
528543
"""Normalize images.
529544
530545
Args:
546+
engine ('opencv' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
531547
alpha (float, default 1.0): Norm value to normalize to or the lower range boundary in case of the range normalization.
532548
beta (float, default 0.0): Upper range boundary in case of the range normalization; it is not used for the norm normalization.
533549
norm_type (str, default "l2"): Normalization type. Accepted values are "inf", "l1", "l2" and "minmax".
@@ -544,6 +560,9 @@ def image_normalize(
544560
Returns:
545561
bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ.
546562
"""
563+
if engine is None or engine.casefold() != "opencv":
564+
raise ValueError("Must specify the engine, supported value is 'opencv'.")
565+
547566
import bigframes.blob._functions as blob_func
548567

549568
connection = self._resolve_connection(connection)
@@ -604,6 +623,7 @@ def image_normalize(
604623
def pdf_extract(
605624
self,
606625
*,
626+
engine: Literal[None, "pypdf"] = None,
607627
connection: Optional[str] = None,
608628
max_batching_rows: int = 1,
609629
container_cpu: Union[float, int] = 2,
@@ -613,6 +633,7 @@ def pdf_extract(
613633
"""Extracts text from PDF URLs and saves the text as string.
614634
615635
Args:
636+
engine ('pypdf' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
616637
connection (str or None, default None): BQ connection used for
617638
function internet transactions, and the output blob if "dst"
618639
is str. If None, uses default connection of the session.
@@ -631,6 +652,9 @@ def pdf_extract(
631652
Contains the extracted text from the PDF file.
632653
Includes error messages if verbosity is enabled.
633654
"""
655+
if engine is None or engine.casefold() != "pypdf":
656+
raise ValueError("Must specify the engine, supported value is 'pypdf'.")
657+
634658
import bigframes.bigquery as bbq
635659
import bigframes.blob._functions as blob_func
636660
import bigframes.pandas as bpd
@@ -663,6 +687,7 @@ def pdf_extract(
663687
def pdf_chunk(
664688
self,
665689
*,
690+
engine: Literal[None, "pypdf"] = None,
666691
connection: Optional[str] = None,
667692
chunk_size: int = 2000,
668693
overlap_size: int = 200,
@@ -675,6 +700,7 @@ def pdf_chunk(
675700
arrays of strings.
676701
677702
Args:
703+
engine ('pypdf' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
678704
connection (str or None, default None): BQ connection used for
679705
function internet transactions, and the output blob if "dst"
680706
is str. If None, uses default connection of the session.
@@ -698,6 +724,8 @@ def pdf_chunk(
698724
where each string is a chunk of text extracted from PDF.
699725
Includes error messages if verbosity is enabled.
700726
"""
727+
if engine is None or engine.casefold() != "pypdf":
728+
raise ValueError("Must specify the engine, supported value is 'pypdf'.")
701729

702730
import bigframes.bigquery as bbq
703731
import bigframes.blob._functions as blob_func
@@ -740,6 +768,7 @@ def pdf_chunk(
740768
def audio_transcribe(
741769
self,
742770
*,
771+
engine: Literal["bigquery"] = "bigquery",
743772
connection: Optional[str] = None,
744773
model_name: Optional[
745774
Literal[
@@ -753,6 +782,7 @@ def audio_transcribe(
753782
Transcribe audio content using a Gemini multimodal model.
754783
755784
Args:
785+
engine ('bigquery'): The engine (bigquery or third party library) used for the function.
756786
connection (str or None, default None): BQ connection used for
757787
function internet transactions, and the output blob if "dst"
758788
is str. If None, uses default connection of the session.
@@ -770,6 +800,9 @@ def audio_transcribe(
770800
Contains the transcribed text from the audio file.
771801
Includes error messages if verbosity is enabled.
772802
"""
803+
if engine.casefold() != "bigquery":
804+
raise ValueError("Must specify the engine, supported value is 'bigquery'.")
805+
773806
import bigframes.bigquery as bbq
774807
import bigframes.ml.llm as llm
775808
import bigframes.pandas as bpd

bigframes/testing/mocks.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ def create_bigquery_session(
4141
bqclient: Optional[mock.Mock] = None,
4242
session_id: str = "abcxyz",
4343
table_schema: Sequence[google.cloud.bigquery.SchemaField] = TEST_SCHEMA,
44+
table_name: str = "test_table",
4445
anonymous_dataset: Optional[google.cloud.bigquery.DatasetReference] = None,
4546
location: str = "test-region",
4647
ordering_mode: Literal["strict", "partial"] = "partial",
@@ -76,7 +77,7 @@ def create_bigquery_session(
7677
type(table).schema = mock.PropertyMock(return_value=table_schema)
7778
type(table).project = anonymous_dataset.project
7879
type(table).dataset_id = anonymous_dataset.dataset_id
79-
type(table).table_id = "test_table"
80+
type(table).table_id = table_name
8081
type(table).num_rows = mock.PropertyMock(return_value=1000000000)
8182
bqclient.get_table.return_value = table
8283

@@ -94,7 +95,7 @@ def query_mock(
9495
query_job = mock.create_autospec(google.cloud.bigquery.QueryJob, instance=True)
9596
query_job._properties = {}
9697
type(query_job).destination = mock.PropertyMock(
97-
return_value=anonymous_dataset.table("test_table"),
98+
return_value=anonymous_dataset.table(table_name),
9899
)
99100
type(query_job).statement_type = mock.PropertyMock(return_value="SELECT")
100101

notebooks/multimodal/multimodal_dataframe.ipynb

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -254,16 +254,17 @@
254254
"outputs": [],
255255
"source": [
256256
"df_image[\"blurred\"] = df_image[\"image\"].blob.image_blur(\n",
257-
" (20, 20), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_transformed/\"\n",
257+
" (20, 20), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_transformed/\", engine=\"opencv\"\n",
258258
")\n",
259259
"df_image[\"resized\"] = df_image[\"image\"].blob.image_resize(\n",
260-
" (300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_resize_transformed/\"\n",
260+
" (300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_resize_transformed/\", engine=\"opencv\"\n",
261261
")\n",
262262
"df_image[\"normalized\"] = df_image[\"image\"].blob.image_normalize(\n",
263263
" alpha=50.0,\n",
264264
" beta=150.0,\n",
265265
" norm_type=\"minmax\",\n",
266266
" dst=f\"gs://{OUTPUT_BUCKET}/image_normalize_transformed/\",\n",
267+
" engine=\"opencv\",\n",
267268
")"
268269
]
269270
},
@@ -280,7 +281,7 @@
280281
"outputs": [],
281282
"source": [
282283
"# You can also chain functions together\n",
283-
"df_image[\"blur_resized\"] = df_image[\"blurred\"].blob.image_resize((300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_resize_transformed/\")"
284+
"df_image[\"blur_resized\"] = df_image[\"blurred\"].blob.image_resize((300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_resize_transformed/\", engine=\"opencv\")"
284285
]
285286
},
286287
{
@@ -419,7 +420,7 @@
419420
},
420421
"outputs": [],
421422
"source": [
422-
"df_pdf[\"chunked\"] = df_pdf[\"pdf\"].blob.pdf_chunk()"
423+
"df_pdf[\"chunked\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\")"
423424
]
424425
},
425426
{

samples/snippets/multimodal_test.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,21 +56,22 @@ def test_multimodal_dataframe(gcs_dst_bucket: str) -> None:
5656

5757
# [START bigquery_dataframes_multimodal_dataframe_image_transform]
5858
df_image["blurred"] = df_image["image"].blob.image_blur(
59-
(20, 20), dst=f"{dst_bucket}/image_blur_transformed/"
59+
(20, 20), dst=f"{dst_bucket}/image_blur_transformed/", engine="opencv"
6060
)
6161
df_image["resized"] = df_image["image"].blob.image_resize(
62-
(300, 200), dst=f"{dst_bucket}/image_resize_transformed/"
62+
(300, 200), dst=f"{dst_bucket}/image_resize_transformed/", engine="opencv"
6363
)
6464
df_image["normalized"] = df_image["image"].blob.image_normalize(
6565
alpha=50.0,
6666
beta=150.0,
6767
norm_type="minmax",
6868
dst=f"{dst_bucket}/image_normalize_transformed/",
69+
engine="opencv",
6970
)
7071

7172
# You can also chain functions together
7273
df_image["blur_resized"] = df_image["blurred"].blob.image_resize(
73-
(300, 200), dst=f"{dst_bucket}/image_blur_resize_transformed/"
74+
(300, 200), dst=f"{dst_bucket}/image_blur_resize_transformed/", engine="opencv"
7475
)
7576
df_image
7677
# [END bigquery_dataframes_multimodal_dataframe_image_transform]
@@ -113,7 +114,7 @@ def test_multimodal_dataframe(gcs_dst_bucket: str) -> None:
113114
df_pdf = bpd.from_glob_path(
114115
"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*", name="pdf"
115116
)
116-
df_pdf["chunked"] = df_pdf["pdf"].blob.pdf_chunk()
117+
df_pdf["chunked"] = df_pdf["pdf"].blob.pdf_chunk(engine="pypdf")
117118
chunked = df_pdf["chunked"].explode()
118119
chunked
119120
# [END bigquery_dataframes_multimodal_dataframe_pdf_chunk]

0 commit comments

Comments
 (0)