 import datetime
 import io
 import itertools
+import math
 import os
 import typing
 from typing import (
@@ -397,6 +398,13 @@ def stream_data(
         offsets_col: str,
     ) -> bq_data.BigqueryDataSource:
         """Load managed data into bigquery"""
+        MAX_BYTES = 10000000  # streaming api has 10MB limit
+        SAFETY_MARGIN = (
+            40  # Perf seems bad for large chunks, so do 40x smaller than max
+        )
+        batch_count = max(1, data.metadata.total_bytes // (MAX_BYTES // SAFETY_MARGIN))  # never 0
+        rows_per_batch = math.ceil(data.metadata.row_count / batch_count)
+
         schema_w_offsets = data.schema.append(
             schemata.SchemaItem(offsets_col, bigframes.dtypes.INT_DTYPE)
         )
@@ -410,16 +418,24 @@ def stream_data(
         )
         rows_w_offsets = ((*row, offset) for offset, row in enumerate(rows))
 
-        for errors in self._bqclient.insert_rows(
-            load_table_destination,
-            rows_w_offsets,
-            selected_fields=bq_schema,
-            row_ids=map(str, itertools.count()),  # used to ensure only-once insertion
-        ):
-            if errors:
-                raise ValueError(
-                    f"Problem loading at least one row from DataFrame: {errors}. {constants.FEEDBACK_LINK}"
-                )
+        # TODO: don't use batched
+        batches = _batched(rows_w_offsets, rows_per_batch)
+        ids_iter = map(str, itertools.count())
+
+        for batch in batches:
+            batch_rows = list(batch)
+            row_ids = itertools.islice(ids_iter, len(batch_rows))
+
+            for errors in self._bqclient.insert_rows(
+                load_table_destination,
+                batch_rows,
+                selected_fields=bq_schema,
+                row_ids=row_ids,  # used to ensure only-once insertion
+            ):
+                if errors:
+                    raise ValueError(
+                        f"Problem loading at least one row from DataFrame: {errors}. {constants.FEEDBACK_LINK}"
+                    )
         destination_table = self._bqclient.get_table(load_table_destination)
         return bq_data.BigqueryDataSource(
             bq_data.GbqTable.from_table(destination_table),
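With the 10 MB streaming limit and the 40x safety margin, each `insert_rows` call now targets roughly 250 KB. A minimal sketch of the sizing arithmetic, using purely illustrative numbers (the 80 MB / 200,000-row frame below is hypothetical, not taken from the change):

```python
import math

# Hypothetical frame: 80 MB spread across 200,000 rows.
total_bytes = 80_000_000
row_count = 200_000

MAX_BYTES = 10_000_000   # streaming API request limit
SAFETY_MARGIN = 40       # target ~250 KB per insert_rows call

target_batch_bytes = MAX_BYTES // SAFETY_MARGIN           # 250_000 bytes
batch_count = max(1, total_bytes // target_batch_bytes)   # 320 batches
rows_per_batch = math.ceil(row_count / batch_count)       # 625 rows per call
```

Slicing `ids_iter` by `len(batch_rows)` keeps the generated insert IDs globally unique across batches, so the only-once semantics of the previous single `insert_rows` call are preserved.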
@@ -434,6 +450,13 @@ def write_data(
         offsets_col: str,
     ) -> bq_data.BigqueryDataSource:
         """Load managed data into bigquery"""
+        MAX_BYTES = 10000000  # streaming api has 10MB limit
+        SAFETY_MARGIN = (
+            4  # aim for 2.5mb to account for row variance, format differences, etc.
+        )
+        batch_count = max(1, data.metadata.total_bytes // (MAX_BYTES // SAFETY_MARGIN))  # never 0
+        rows_per_batch = math.ceil(data.metadata.row_count / batch_count)
+
         schema_w_offsets = data.schema.append(
             schemata.SchemaItem(offsets_col, bigframes.dtypes.INT_DTYPE)
         )
@@ -450,7 +473,9 @@ def write_data(
 
         def request_gen() -> Generator[bq_storage_types.AppendRowsRequest, None, None]:
             schema, batches = data.to_arrow(
-                offsets_col=offsets_col, duration_type="int"
+                offsets_col=offsets_col,
+                duration_type="int",
+                max_chunksize=rows_per_batch,
             )
             offset = 0
             for batch in batches:
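On the write path, the same row budget is pushed down into the Arrow conversion via `max_chunksize`, so each `AppendRowsRequest` is built from a record batch of at most `rows_per_batch` rows. A hedged sketch of that chunking behaviour with plain pyarrow, assuming `data.to_arrow` ultimately forwards `max_chunksize` to `pyarrow.Table.to_batches` (an assumption, not confirmed by the diff):

```python
import pyarrow as pa

# Hypothetical stand-in for the managed data; the real code goes through
# data.to_arrow(..., max_chunksize=rows_per_batch).
table = pa.table({"x": list(range(10)), "offsets": list(range(10))})

rows_per_batch = 4
batches = table.to_batches(max_chunksize=rows_per_batch)
print([b.num_rows for b in batches])  # [4, 4, 2] -- no batch exceeds the cap
```

With `SAFETY_MARGIN = 4`, each request targets roughly 2.5 MB against the same 10 MB limit, leaving headroom for row-size variance and serialization overhead.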
@@ -1332,3 +1357,10 @@ def _validate_dtype_can_load(name: str, column_type: bigframes.dtypes.Dtype):
             f"Nested JSON types, found in column `{name}`: `{column_type}`', "
             f"are currently unsupported for upload. {constants.FEEDBACK_LINK}"
         )
+
+
+# itertools.batched not available in python <3.12, so we use this instead
+def _batched(iterator: Iterable, n: int) -> Iterable:
+    assert n > 0
+    while batch := tuple(itertools.islice(iterator, n)):
+        yield batch
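A quick usage sketch of the `_batched` backport above. Note that it relies on being handed an iterator (both call sites pass generator expressions); a plain list would be re-sliced from its start on every pass and loop forever:

```python
rows = iter(range(10))          # must be an iterator, not a list
print(list(_batched(rows, 4)))  # [(0, 1, 2, 3), (4, 5, 6, 7), (8, 9)]

# On Python 3.12+ the stdlib equivalent is:
# list(itertools.batched(range(10), 4))
```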