@@ -143,6 +143,8 @@ std::vector<sycl::event> _populate_packed_shapes_strides_for_indexing(
143143 packed_host_axes_shapes_strides_shp->begin () + k);
144144 }
145145 else {
146+ // FIXME: this pointer was not allocated in this function
147+ // the caller should be freeing it
146148 sycl::free (device_orthog_shapes_strides, exec_q);
147149 throw std::runtime_error (" Invalid array encountered" );
148150 }
@@ -190,6 +192,8 @@ std::vector<sycl::event> _populate_packed_shapes_strides_for_indexing(
190192 packed_host_axes_shapes_strides_shp->begin () + 2 * k);
191193 }
192194 else {
195+ // FIXME: this pointer was not allocated in this function
196+ // the caller should be freeing it
193197 sycl::free (device_orthog_shapes_strides, exec_q);
194198 throw std::runtime_error (" Invalid array encountered" );
195199 }
@@ -255,6 +259,8 @@ std::vector<sycl::event> _populate_packed_shapes_strides_for_indexing(
255259 packed_host_axes_shapes_strides_shp->begin () + 2 );
256260 }
257261 else {
262+ // FIXME: memory was not allocated in this function
263+ // it should be freed by the caller
258264 sycl::free (device_orthog_shapes_strides, exec_q);
259265 throw std::runtime_error (" Invalid array encountered" );
260266 }
@@ -292,23 +298,24 @@ std::vector<dpctl::tensor::usm_ndarray> parse_py_ind(const sycl::queue &q,
292298 std::vector<dpctl::tensor::usm_ndarray> res;
293299 res.reserve (ind_count);
294300
295- bool acquired = false ;
301+ bool nd_is_known = false ;
296302 int nd = -1 ;
297303 for (size_t i = 0 ; i < ind_count; ++i) {
298- auto el_i = py_ind[py::cast (i)];
299- auto arr_i = py::cast<dpctl::tensor::usm_ndarray>(el_i);
304+ py::object el_i = py_ind[py::cast (i)];
305+ dpctl::tensor::usm_ndarray arr_i =
306+ py::cast<dpctl::tensor::usm_ndarray>(el_i);
300307 if (!dpctl::utils::queues_are_compatible (q, {arr_i})) {
301308 throw py::value_error (" Index allocation queue is not compatible "
302309 " with execution queue" );
303310 }
304- if (acquired ) {
311+ if (nd_is_known ) {
305312 if (nd != arr_i.get_ndim ()) {
306313 throw py::value_error (
307314 " Indices must have the same number of dimensions." );
308315 }
309316 }
310317 else {
311- acquired = true ;
318+ nd_is_known = true ;
312319 nd = arr_i.get_ndim ();
313320 }
314321 res.push_back (arr_i);
@@ -558,6 +565,7 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
558565 sycl::malloc_device<py::ssize_t >((k + 1 ) * ind_sh_elems, exec_q);
559566
560567 if (packed_ind_shapes_strides == nullptr ) {
568+ sycl::free (packed_ind_ptrs, exec_q);
561569 throw std::runtime_error (
562570 " Unable to allocate packed_ind_shapes_strides device memory" );
563571 }
@@ -566,6 +574,8 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
566574 sycl::malloc_device<py::ssize_t >(k, exec_q);
567575
568576 if (packed_ind_offsets == nullptr ) {
577+ sycl::free (packed_ind_ptrs, exec_q);
578+ sycl::free (packed_ind_shapes_strides, exec_q);
569579 throw std::runtime_error (
570580 " Unable to allocate packed_ind_offsets device memory" );
571581 }
@@ -595,33 +605,29 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
595605 std::copy (ind_offsets.begin (), ind_offsets.end (),
596606 host_ind_offsets_shp->begin ());
597607
598- std::vector<sycl::event> host_task_events (5 );
608+ std::vector<sycl::event> host_task_events;
609+ host_task_events.reserve (5 );
599610
600611 sycl::event packed_ind_ptrs_copy_ev = exec_q.copy <char *>(
601612 host_ind_ptrs_shp->data (), packed_ind_ptrs, host_ind_ptrs_shp->size ());
602- sycl::event ind_ptrs_host_task = exec_q.submit ([&](sycl::handler &cgh) {
603- cgh.depends_on (packed_ind_ptrs_copy_ev);
604- cgh.host_task ([host_ind_ptrs_shp]() {});
605- });
606- host_task_events.push_back (ind_ptrs_host_task);
607613
608614 sycl::event packed_ind_shapes_strides_copy_ev = exec_q.copy <py::ssize_t >(
609615 host_ind_shapes_strides_shp->data (), packed_ind_shapes_strides,
610616 host_ind_shapes_strides_shp->size ());
611- sycl::event ind_sh_st_host_task = exec_q.submit ([&](sycl::handler &cgh) {
612- cgh.depends_on (packed_ind_shapes_strides_copy_ev);
613- cgh.host_task ([host_ind_shapes_strides_shp]() {});
614- });
615- host_task_events.push_back (ind_sh_st_host_task);
616617
617618 sycl::event packed_ind_offsets_copy_ev = exec_q.copy <py::ssize_t >(
618619 host_ind_offsets_shp->data (), packed_ind_offsets,
619620 host_ind_offsets_shp->size ());
620- sycl::event ind_offsets_host_task = exec_q.submit ([&](sycl::handler &cgh) {
621- cgh.depends_on (packed_ind_offsets_copy_ev);
622- cgh.host_task ([host_ind_offsets_shp]() {});
623- });
624- host_task_events.push_back (ind_offsets_host_task);
621+
622+ sycl::event shared_ptr_cleanup_host_task =
623+ exec_q.submit ([&](sycl::handler &cgh) {
624+ cgh.depends_on ({packed_ind_offsets_copy_ev,
625+ packed_ind_shapes_strides_copy_ev,
626+ packed_ind_ptrs_copy_ev});
627+ cgh.host_task ([host_ind_offsets_shp, host_ind_shapes_strides_shp,
628+ host_ind_ptrs_shp]() {});
629+ });
630+ host_task_events.push_back (shared_ptr_cleanup_host_task);
625631
626632 std::vector<sycl::event> ind_pack_depends{packed_ind_ptrs_copy_ev,
627633 packed_ind_shapes_strides_copy_ev,
@@ -643,6 +649,10 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
643649 sycl::malloc_device<py::ssize_t >(3 * sh_elems, exec_q);
644650
645651 if (packed_shapes_strides == nullptr ) {
652+ sycl::event::wait (host_task_events);
653+ sycl::free (packed_ind_ptrs, exec_q);
654+ sycl::free (packed_ind_shapes_strides, exec_q);
655+ sycl::free (packed_ind_offsets, exec_q);
646656 throw std::runtime_error (
647657 " Unable to allocate packed_shapes_strides device memory" );
648658 }
@@ -654,6 +664,11 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
654664 sycl::malloc_device<py::ssize_t >((2 * k) + ind_sh_elems, exec_q);
655665
656666 if (packed_axes_shapes_strides == nullptr ) {
667+ sycl::event::wait (host_task_events);
668+ sycl::free (packed_ind_ptrs, exec_q);
669+ sycl::free (packed_ind_shapes_strides, exec_q);
670+ sycl::free (packed_ind_offsets, exec_q);
671+ sycl::free (packed_shapes_strides, exec_q);
657672 throw std::runtime_error (
658673 " Unable to allocate packed_axes_shapes_strides device memory" );
659674 }
@@ -665,8 +680,9 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
665680 is_src_f_contig, dst_shape, dst_strides, is_dst_c_contig,
666681 is_dst_f_contig, axis_start, k, ind_nd, src_nd, dst_nd);
667682
668- std::vector<sycl::event> all_deps (depends.size () + ind_pack_depends.size () +
669- src_dst_pack_deps.size ());
683+ std::vector<sycl::event> all_deps;
684+ all_deps.reserve (depends.size () + ind_pack_depends.size () +
685+ src_dst_pack_deps.size ());
670686 all_deps.insert (std::end (all_deps), std::begin (ind_pack_depends),
671687 std::end (ind_pack_depends));
672688 all_deps.insert (std::end (all_deps), std::begin (src_dst_pack_deps),
@@ -676,6 +692,12 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
676692 auto fn = take_dispatch_table[mode][src_type_id][ind_type_id];
677693
678694 if (fn == nullptr ) {
695+ sycl::event::wait (host_task_events);
696+ sycl::free (packed_ind_ptrs, exec_q);
697+ sycl::free (packed_ind_shapes_strides, exec_q);
698+ sycl::free (packed_ind_offsets, exec_q);
699+ sycl::free (packed_shapes_strides, exec_q);
700+ sycl::free (packed_axes_shapes_strides, exec_q);
679701 throw std::runtime_error (" Indices must be integer type, got " +
680702 std::to_string (ind_type_id));
681703 }
@@ -687,7 +709,7 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
687709 src_offset, dst_offset, packed_ind_offsets, all_deps);
688710
689711 // free packed temporaries
690- exec_q.submit ([&](sycl::handler &cgh) {
712+ sycl::event temporaries_cleanup_ev = exec_q.submit ([&](sycl::handler &cgh) {
691713 cgh.depends_on (take_generic_ev);
692714 auto ctx = exec_q.get_context ();
693715 cgh.host_task ([packed_shapes_strides, packed_axes_shapes_strides,
@@ -700,12 +722,16 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
700722 sycl::free (packed_ind_offsets, ctx);
701723 });
702724 });
703- host_task_events.push_back (take_generic_ev);
704725
705- sycl::event host_task_ev =
706- keep_args_alive (exec_q, {src, py_ind, dst}, host_task_events);
726+ sycl::event::wait (host_task_events);
727+ sycl::event::wait ({take_generic_ev, temporaries_cleanup_ev});
728+
729+ /*
730+ sycl::event host_task_ev = keep_args_alive(exec_q, {src, py_ind, dst},
731+ {temporaries_cleanup_ev});
732+ */
707733
708- return std::make_pair (host_task_ev, take_generic_ev );
734+ return std::make_pair (sycl::event (), temporaries_cleanup_ev );
709735}
710736
711737std::pair<sycl::event, sycl::event>
@@ -951,6 +977,7 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
951977 sycl::malloc_device<py::ssize_t >((k + 1 ) * ind_sh_elems, exec_q);
952978
953979 if (packed_ind_shapes_strides == nullptr ) {
980+ sycl::free (packed_ind_ptrs, exec_q);
954981 throw std::runtime_error (
955982 " Unable to allocate packed_ind_shapes_strides device memory" );
956983 }
@@ -959,6 +986,8 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
959986 sycl::malloc_device<py::ssize_t >(k, exec_q);
960987
961988 if (packed_ind_offsets == nullptr ) {
989+ sycl::free (packed_ind_ptrs, exec_q);
990+ sycl::free (packed_ind_shapes_strides, exec_q);
962991 throw std::runtime_error (
963992 " Unable to allocate packed_ind_offsets device memory" );
964993 }
@@ -988,7 +1017,8 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
9881017 std::copy (ind_offsets.begin (), ind_offsets.end (),
9891018 host_ind_offsets_shp->begin ());
9901019
991- std::vector<sycl::event> host_task_events (5 );
1020+ std::vector<sycl::event> host_task_events;
1021+ host_task_events.reserve (7 );
9921022
9931023 sycl::event device_ind_ptrs_copy_ev = exec_q.copy <char *>(
9941024 host_ind_ptrs_shp->data (), packed_ind_ptrs, host_ind_ptrs_shp->size ());
@@ -1036,6 +1066,10 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
10361066 sycl::malloc_device<py::ssize_t >(3 * sh_elems, exec_q);
10371067
10381068 if (packed_shapes_strides == nullptr ) {
1069+ sycl::event::wait (host_task_events);
1070+ sycl::free (packed_ind_ptrs, exec_q);
1071+ sycl::free (packed_ind_shapes_strides, exec_q);
1072+ sycl::free (packed_ind_offsets, exec_q);
10391073 throw std::runtime_error (
10401074 " Unable to allocate packed_shapes_strides device memory" );
10411075 }
@@ -1047,6 +1081,11 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
10471081 sycl::malloc_device<py::ssize_t >((2 * k) + ind_sh_elems, exec_q);
10481082
10491083 if (packed_axes_shapes_strides == nullptr ) {
1084+ sycl::event::wait (host_task_events);
1085+ sycl::free (packed_shapes_strides, exec_q);
1086+ sycl::free (packed_ind_ptrs, exec_q);
1087+ sycl::free (packed_ind_shapes_strides, exec_q);
1088+ sycl::free (packed_ind_offsets, exec_q);
10501089 throw std::runtime_error (
10511090 " Unable to allocate packed_axes_shapes_strides device memory" );
10521091 }
@@ -1070,6 +1109,13 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
10701109 auto fn = put_dispatch_table[mode][dst_type_id][ind_type_id];
10711110
10721111 if (fn == nullptr ) {
1112+ sycl::event::wait (host_task_events);
1113+ sycl::free (packed_shapes_strides, exec_q);
1114+ sycl::free (packed_axes_shapes_strides, exec_q);
1115+ sycl::free (packed_ind_shapes_strides, exec_q);
1116+ sycl::free (packed_ind_ptrs, exec_q);
1117+ sycl::free (packed_ind_offsets, exec_q);
1118+
10731119 throw std::runtime_error (" Indices must be integer type, got " +
10741120 std::to_string (ind_type_id));
10751121 }
@@ -1081,9 +1127,10 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
10811127 dst_offset, val_offset, packed_ind_offsets, all_deps);
10821128
10831129 // free packed temporaries
1084- auto ctx = exec_q. get_context ();
1085- exec_q.submit ([&](sycl::handler &cgh) {
1130+
1131+ sycl::event temporaries_cleanup_ev = exec_q.submit ([&](sycl::handler &cgh) {
10861132 cgh.depends_on (put_generic_ev);
1133+ auto ctx = exec_q.get_context ();
10871134 cgh.host_task ([packed_shapes_strides, packed_axes_shapes_strides,
10881135 packed_ind_shapes_strides, packed_ind_ptrs,
10891136 packed_ind_offsets, ctx]() {
@@ -1094,11 +1141,17 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
10941141 sycl::free (packed_ind_offsets, ctx);
10951142 });
10961143 });
1097- host_task_events.push_back (put_generic_ev);
10981144
1099- return std::make_pair (
1100- keep_args_alive (exec_q, {dst, py_ind, val}, host_task_events),
1101- put_generic_ev);
1145+ sycl::event::wait (host_task_events);
1146+ sycl::event::wait ({put_generic_ev, temporaries_cleanup_ev});
1147+
1148+ /*
1149+ sycl::event py_obj_cleanup_ev =
1150+ keep_args_alive(exec_q, {dst, py_ind, val},
1151+ {put_generic_ev, temporaries_cleanup_ev});
1152+ */
1153+
1154+ return std::make_pair (sycl::event (), temporaries_cleanup_ev);
11021155}
11031156
11041157void init_advanced_indexing_dispatch_tables (void )
0 commit comments