@@ -647,3 +647,226 @@ def foo(x: int) -> int:
647647 container_cpu = 2 ,
648648 container_memory = "64Mi" ,
649649 )(foo )
650+
651+
def test_managed_function_df_apply_axis_1(session, dataset_id, scalars_dfs):
    """Apply a row-processing managed function via DataFrame.apply(axis=1).

    Serializes each row in several JSON orients plus a custom dict, compares
    the BigFrames result against plain pandas, and then re-loads the deployed
    function through ``session.read_gbq_function`` to confirm that reuse path
    also works.
    """
    columns = ["bool_col", "int64_col", "int64_too", "float64_col", "string_col"]
    scalars_df, scalars_pandas_df = scalars_dfs

    # Pre-initialize so the cleanup in `finally` cannot raise UnboundLocalError
    # (masking the real failure) if the managed function fails to deploy.
    serialize_row_mf = None
    try:

        def serialize_row(row):
            # Explicitly casting types ensures consistent behavior between
            # BigFrames and pandas. Without it, BigFrames return plain Python
            # types, e.g. 0, while pandas return NumPy types, e.g. np.int64(0),
            # which could lead to mismatches and requires further investigation.
            custom = {
                "name": int(row.name),
                "index": list(row.index),
                "values": [
                    val.item() if hasattr(val, "item") else val for val in row.values
                ],
            }

            return str(
                {
                    "default": row.to_json(),
                    "split": row.to_json(orient="split"),
                    "records": row.to_json(orient="records"),
                    "index": row.to_json(orient="index"),
                    "table": row.to_json(orient="table"),
                    "custom": custom,
                }
            )

        serialize_row_mf = session.udf(
            input_types=bigframes.series.Series,
            output_type=str,
            dataset=dataset_id,
            name=prefixer.create_prefix(),
        )(serialize_row)

        # A Series-typed input should mark the function as a row processor.
        assert getattr(serialize_row_mf, "is_row_processor")

        bf_result = scalars_df[columns].apply(serialize_row_mf, axis=1).to_pandas()
        pd_result = scalars_pandas_df[columns].apply(serialize_row, axis=1)

        # bf_result.dtype is 'string[pyarrow]' while pd_result.dtype is 'object'
        # , ignore this mismatch by using check_dtype=False.
        pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)

        # Let's make sure the read_gbq_function path works for this function.
        serialize_row_reuse = session.read_gbq_function(
            serialize_row_mf.bigframes_bigquery_function, is_row_processor=True
        )
        bf_result = scalars_df[columns].apply(serialize_row_reuse, axis=1).to_pandas()
        pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)

    finally:
        # clean up the gcp assets created for the managed function, but only
        # if it was actually created.
        if serialize_row_mf is not None:
            cleanup_function_assets(
                serialize_row_mf, session.bqclient, session.cloudfunctionsclient
            )
709+
710+
def test_managed_function_df_apply_axis_1_aggregates(session, dataset_id, scalars_dfs):
    """Apply a managed function that aggregates each row via axis=1.

    The function computes per-row summary statistics (count/min/max/mean/
    std/var) and the results are compared between BigFrames and pandas.
    """
    columns = ["int64_col", "int64_too", "float64_col"]
    scalars_df, scalars_pandas_df = scalars_dfs

    # Pre-initialize so the cleanup in `finally` cannot raise UnboundLocalError
    # (masking the real failure) if the managed function fails to deploy.
    analyze_mf = None
    try:

        def analyze(row):
            # Explicitly casting types ensures consistent behavior between
            # BigFrames and pandas. Without it, BigFrames return plain Python
            # types, e.g. 0, while pandas return NumPy types, e.g. np.int64(0),
            # which could lead to mismatches and requires further investigation.
            return str(
                {
                    "dtype": row.dtype,
                    "count": int(row.count()),
                    "min": int(row.min()),
                    "max": int(row.max()),
                    "mean": float(row.mean()),
                    "std": float(row.std()),
                    "var": float(row.var()),
                }
            )

        analyze_mf = session.udf(
            input_types=bigframes.series.Series,
            output_type=str,
            dataset=dataset_id,
            name=prefixer.create_prefix(),
        )(analyze)

        # A Series-typed input should mark the function as a row processor.
        assert getattr(analyze_mf, "is_row_processor")

        bf_result = scalars_df[columns].dropna().apply(analyze_mf, axis=1).to_pandas()
        pd_result = scalars_pandas_df[columns].dropna().apply(analyze, axis=1)

        # bf_result.dtype is 'string[pyarrow]' while pd_result.dtype is 'object'
        # , ignore this mismatch by using check_dtype=False.
        pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)

    finally:
        # clean up the gcp assets created for the managed function, but only
        # if it was actually created.
        if analyze_mf is not None:
            cleanup_function_assets(
                analyze_mf, session.bqclient, session.cloudfunctionsclient
            )
755+
756+
@pytest.mark.parametrize(
    ("pd_df",),
    [
        pytest.param(
            pandas.DataFrame(
                {
                    "2": [1, 2, 3],
                    2: [1.5, 3.75, 5],
                    "name, [with. special'- chars\")/\\": [10, 20, 30],
                    (3, 4): ["pq", "rs", "tu"],
                    (5.0, "six", 7): [8, 9, 10],
                    'raise Exception("hacked!")': [11, 12, 13],
                },
                # Default pandas index has non-numpy type, whereas bigframes is
                # always numpy-based type, so let's use the index compatible
                # with bigframes. See more details in b/369689696.
                index=pandas.Index([0, 1, 2], dtype=pandas.Int64Dtype()),
            ),
            id="all-kinds-of-column-names",
        ),
        pytest.param(
            pandas.DataFrame(
                {
                    "x": [1, 2, 3],
                    "y": [1.5, 3.75, 5],
                    "z": ["pq", "rs", "tu"],
                },
                index=pandas.MultiIndex.from_frame(
                    pandas.DataFrame(
                        {
                            "idx0": pandas.Series(
                                ["a", "a", "b"], dtype=pandas.StringDtype()
                            ),
                            "idx1": pandas.Series(
                                [100, 200, 300], dtype=pandas.Int64Dtype()
                            ),
                        }
                    )
                ),
            ),
            id="multiindex",
            marks=pytest.mark.skip(
                reason="TODO: revert this skip after this pandas bug is fixed: https://github.com/pandas-dev/pandas/issues/59908"
            ),
        ),
        pytest.param(
            pandas.DataFrame(
                [
                    [10, 1.5, "pq"],
                    [20, 3.75, "rs"],
                    [30, 8.0, "tu"],
                ],
                # Default pandas index has non-numpy type, whereas bigframes is
                # always numpy-based type, so let's use the index compatible
                # with bigframes. See more details in b/369689696.
                index=pandas.Index([0, 1, 2], dtype=pandas.Int64Dtype()),
                columns=pandas.MultiIndex.from_arrays(
                    [
                        ["first", "last_two", "last_two"],
                        [1, 2, 3],
                    ]
                ),
            ),
            id="column-multiindex",
        ),
    ],
)
def test_managed_function_df_apply_axis_1_complex(session, dataset_id, pd_df):
    """Apply a row-processing managed function to DataFrames with tricky
    column names and (multi-)indexes.

    Each parametrized DataFrame exercises a different edge case (unusual
    column labels, a row MultiIndex, a column MultiIndex); results are
    compared against plain pandas.
    """
    bf_df = session.read_pandas(pd_df)

    # Pre-initialize so the cleanup in `finally` cannot raise UnboundLocalError
    # (masking the real failure) if the managed function fails to deploy.
    serialize_row_mf = None
    try:

        def serialize_row(row):
            # Explicitly casting types ensures consistent behavior between
            # BigFrames and pandas. Without it, BigFrames return plain Python
            # types, e.g. 0, while pandas return NumPy types, e.g. np.int64(0),
            # which could lead to mismatches and requires further investigation.
            custom = {
                "name": int(row.name),
                "index": list(row.index),
                "values": [
                    val.item() if hasattr(val, "item") else val for val in row.values
                ],
            }
            return str(
                {
                    "default": row.to_json(),
                    "split": row.to_json(orient="split"),
                    "records": row.to_json(orient="records"),
                    "index": row.to_json(orient="index"),
                    "custom": custom,
                }
            )

        serialize_row_mf = session.udf(
            input_types=bigframes.series.Series,
            output_type=str,
            dataset=dataset_id,
            name=prefixer.create_prefix(),
        )(serialize_row)

        # A Series-typed input should mark the function as a row processor.
        assert getattr(serialize_row_mf, "is_row_processor")

        bf_result = bf_df.apply(serialize_row_mf, axis=1).to_pandas()
        pd_result = pd_df.apply(serialize_row, axis=1)

        # ignore known dtype difference between pandas and bigframes.
        pandas.testing.assert_series_equal(
            pd_result, bf_result, check_dtype=False, check_index_type=False
        )

    finally:
        # clean up the gcp assets created for the managed function, but only
        # if it was actually created.
        if serialize_row_mf is not None:
            cleanup_function_assets(
                serialize_row_mf, session.bqclient, session.cloudfunctionsclient
            )
0 commit comments