1717
1818use std:: collections:: HashMap ;
1919use std:: ffi:: CString ;
20+ use std:: ptr:: addr_of;
2021use std:: sync:: Arc ;
2122
22- use arrow:: array:: { new_null_array, RecordBatch , RecordBatchReader } ;
23+ use arrow:: array:: { new_null_array, RecordBatch , RecordBatchReader , StructArray } ;
2324use arrow:: compute:: can_cast_types;
2425use arrow:: error:: ArrowError ;
25- use arrow:: ffi:: FFI_ArrowSchema ;
26+ use arrow:: ffi:: { self , FFI_ArrowArray , FFI_ArrowSchema } ;
2627use arrow:: ffi_stream:: FFI_ArrowArrayStream ;
2728use arrow:: pyarrow:: FromPyArrow ;
2829use datafusion:: arrow:: datatypes:: { Schema , SchemaRef } ;
@@ -39,6 +40,7 @@ use datafusion::prelude::*;
3940use datafusion_ffi:: table_provider:: FFI_TableProvider ;
4041use futures:: { StreamExt , TryStreamExt } ;
4142use pyo3:: exceptions:: PyValueError ;
43+ use pyo3:: ffi:: Py_uintptr_t ;
4244use pyo3:: prelude:: * ;
4345use pyo3:: pybacked:: PyBackedStr ;
4446use pyo3:: types:: { PyCapsule , PyList , PyTuple , PyTupleMethods } ;
@@ -526,18 +528,34 @@ impl PyDataFrame {
526528 let batches = wait_for_future ( py, self . df . as_ref ( ) . clone ( ) . collect ( ) ) ?
527529 . map_err ( PyDataFusionError :: from) ?;
528530
529- // Profiling `rb.to_pyarrow(py)` showed that the conversion holds the
530- // Python GIL for almost all of its execution. Serially converting a
531- // large number of batches therefore throttles CPU utilisation. Run the
532- // conversions in Rayon threads and only acquire the GIL when creating
533- // the final PyArrow objects so the CPU intensive work happens in
534- // parallel.
535- py. allow_threads ( move || {
536- batches
537- . into_par_iter ( )
538- . map ( |rb| Python :: with_gil ( |py| rb. to_pyarrow ( py) ) )
539- . collect ( )
540- } )
531+ let ffi_batches: Vec < ( FFI_ArrowArray , FFI_ArrowSchema ) > = py
532+ . allow_threads ( || {
533+ batches
534+ . into_par_iter ( )
535+ . map ( |rb| {
536+ let sa: StructArray = rb. into ( ) ;
537+ ffi:: to_ffi ( & sa. to_data ( ) )
538+ } )
539+ . collect ( )
540+ } )
541+ . map_err ( PyDataFusionError :: from) ?;
542+
543+ let module = py. import ( "pyarrow" ) ?;
544+ let class = module. getattr ( "RecordBatch" ) ?;
545+ ffi_batches
546+ . into_iter ( )
547+ . map ( |( array, schema) | {
548+ class
549+ . call_method1 (
550+ "_import_from_c" ,
551+ (
552+ addr_of ! ( array) as Py_uintptr_t ,
553+ addr_of ! ( schema) as Py_uintptr_t ,
554+ ) ,
555+ )
556+ . map ( Into :: into)
557+ } )
558+ . collect ( )
541559 }
542560
543561 /// Cache DataFrame.
@@ -554,7 +572,34 @@ impl PyDataFrame {
554572
555573 batches
556574 . into_iter ( )
557- . map ( |rbs| rbs. into_iter ( ) . map ( |rb| rb. to_pyarrow ( py) ) . collect ( ) )
575+ . map ( |rbs| {
576+ let ffi_batches: Vec < ( FFI_ArrowArray , FFI_ArrowSchema ) > = py
577+ . allow_threads ( || {
578+ rbs. into_par_iter ( )
579+ . map ( |rb| {
580+ let sa: StructArray = rb. into ( ) ;
581+ ffi:: to_ffi ( & sa. to_data ( ) )
582+ } )
583+ . collect ( )
584+ } )
585+ . map_err ( PyDataFusionError :: from) ?;
586+ let module = py. import ( "pyarrow" ) ?;
587+ let class = module. getattr ( "RecordBatch" ) ?;
588+ ffi_batches
589+ . into_iter ( )
590+ . map ( |( array, schema) | {
591+ class
592+ . call_method1 (
593+ "_import_from_c" ,
594+ (
595+ addr_of ! ( array) as Py_uintptr_t ,
596+ addr_of ! ( schema) as Py_uintptr_t ,
597+ ) ,
598+ )
599+ . map ( Into :: into)
600+ } )
601+ . collect :: < PyResult < Vec < _ > > > ( )
602+ } )
558603 . collect ( )
559604 }
560605
0 commit comments