@@ -43,20 +43,42 @@ def wrapper(*args, **kwargs):
4343
4444
def make_host_memory(size, number, dtype, fill=False):
    """Allocate one pinned host buffer of ``size * number`` bytes.

    A numpy view over the first ``size`` bytes (interpreted as ``dtype``)
    is used for the optional fill and the debug print; the raw pinned
    handle is what gets returned.
    """
    itemsize = np.dtype(dtype).itemsize
    pinned = cupy.cuda.alloc_pinned_memory(size * number)
    # The view deliberately covers only the first `size` bytes of the
    # size*number allocation.
    view = np.frombuffer(pinned, dtype=dtype, count=size // itemsize)
    if fill:
        # NOTE(review): the fill length is bounded by `number` (the buffer
        # count), not by the view's element count — presumably intentional
        # for this benchmark; confirm against callers.
        n_fill = min(1024, number)
        view[:n_fill] = np.arange(n_fill, dtype=dtype)
    print("make:", view.shape, view.itemsize, view)
    return pinned
5355
5456
55- def compare (host1 , host2 , dtype ):
56- host1_np = np .frombuffer (host1 , dtype = dtype )
57- host2_np = np .frombuffer (host2 , dtype = dtype )
58- print ("compare[1]:" , host1_np .shape , host1_np .itemsize , host1_np )
59- print ("compare[2]:" , host2_np .shape , host2_np .itemsize , host2_np )
def make_batch_host_memory(size, number, dtype, fill=False):
    """Allocate ``number`` independent pinned host buffers of ``size`` bytes.

    When ``fill`` is true, the first and last element of each buffer are
    stamped with a per-buffer marker (1023 + index) so corruption or
    misrouting is detectable after a round trip.  Returns the list of
    pinned-memory handles.
    """
    count = size // np.dtype(dtype).itemsize
    buffers = []
    for idx in range(number):
        mem = cupy.cuda.alloc_pinned_memory(size)
        arr = np.frombuffer(mem, dtype=dtype, count=count)
        if fill:
            # The uint64 marker is cast to `dtype` on assignment.
            marker = np.uint64(1023 + idx)
            arr[0] = marker
            arr[-1] = marker
        buffers.append(mem)
        # Only the first buffer is printed, to keep output short.
        if idx == 0:
            print("make:", arr.shape, arr.itemsize, arr)
    return buffers
72+
73+
def compare(host1, host2, size, dtype, show_detail=True):
    """Compare the first ``size`` bytes of two buffers element-wise.

    Both arguments are viewed as flat arrays of ``dtype`` (``size`` must
    cover a whole number of elements to be fully compared).  When
    ``show_detail`` is set, both arrays are printed for inspection.
    Returns True iff the arrays are equal.
    """
    count = size // np.dtype(dtype).itemsize
    lhs = np.frombuffer(host1, dtype=dtype, count=count)
    rhs = np.frombuffer(host2, dtype=dtype, count=count)
    if show_detail:
        print("compare[1]:", lhs.shape, lhs.itemsize, lhs)
        print("compare[2]:", rhs.shape, rhs.itemsize, rhs)
    return np.array_equal(lhs, rhs)
6183
6284
@@ -73,7 +95,7 @@ def trans_with_ce(d, size, number, dtype):
7395 cost = time .perf_counter () - tp
7496 print (f"cost: { cost } s" )
7597 print (f"bandwidth: { size * number / cost / 1e9 } GB/s" )
76- assert compare (host1 , host2 , dtype )
98+ assert compare (host1 , host2 , size , dtype )
7799
78100
79101@test_wrap
@@ -91,7 +113,7 @@ def trans_with_sm(d, size, number, dtype):
91113 cost = time .perf_counter () - tp
92114 print (f"cost: { cost } s" )
93115 print (f"bandwidth: { size * number / cost / 1e9 } GB/s" )
94- assert compare (host1 , host2 , dtype )
116+ assert compare (host1 , host2 , size , dtype )
95117
96118
97119@test_wrap
@@ -108,7 +130,7 @@ def trans_with_ce_async(d, size, number, dtype):
108130 cost = time .perf_counter () - tp
109131 print (f"cost: { cost } s" )
110132 print (f"bandwidth: { size * number / cost / 1e9 } GB/s" )
111- assert compare (host1 , host2 , dtype )
133+ assert compare (host1 , host2 , size , dtype )
112134
113135
114136@test_wrap
@@ -127,7 +149,97 @@ def trans_with_sm_async(d, size, number, dtype):
127149 cost = time .perf_counter () - tp
128150 print (f"cost: { cost } s" )
129151 print (f"bandwidth: { size * number / cost / 1e9 } GB/s" )
130- assert compare (host1 , host2 , dtype )
152+ assert compare (host1 , host2 , size , dtype )
153+
154+
@test_wrap
def trans_batch_with_ce(d, size, number, dtype):
    """Round-trip ``number`` buffers of ``size`` bytes host->device->host
    using synchronous copy-engine batch transfers, then verify each pair.

    Pointer tables are host-side numpy uint64 arrays handed straight to
    the batch APIs.
    """
    stream = d.MakeStream()
    src_host = make_batch_host_memory(size, number, dtype, True)
    src_ptrs = np.array([h.ptr for h in src_host], dtype=np.uint64)
    dev_bufs = [cupy.empty(size, dtype=np.uint8) for _ in range(number)]
    dev_ptrs = np.array([buf.data.ptr for buf in dev_bufs], dtype=np.uint64)
    dst_host = make_batch_host_memory(size, number, dtype)
    dst_ptrs = np.array([h.ptr for h in dst_host], dtype=np.uint64)
    start = time.perf_counter()
    stream.HostToDeviceBatch(src_ptrs, dev_ptrs, size, number)
    stream.DeviceToHostBatch(dev_ptrs, dst_ptrs, size, number)
    cost = time.perf_counter() - start
    print(f"cost: {cost}s")
    print(f"bandwidth: {size * number / cost / 1e9}GB/s")
    # Every destination buffer must match its source buffer exactly.
    for src, dst in zip(src_host, dst_host):
        assert compare(src, dst, size, dtype, False)
172+
173+
@test_wrap
def trans_batch_with_sm(dev, size, number, dtype):
    """Round-trip ``number`` buffers of ``size`` bytes host->device->host
    using synchronous SM-based batch transfers, then verify each pair.

    Unlike the copy-engine variant, the uint64 pointer tables are first
    uploaded into cupy device arrays and the batch APIs receive the
    device addresses of those tables.
    """
    stream = dev.MakeSMStream()

    def _upload_ptr_table(ptrs):
        # Copy a host-side uint64 pointer table into a device array.
        table = cupy.empty(number, dtype=np.uint64)
        table.set(ptrs)
        return table

    src_host = make_batch_host_memory(size, number, dtype, True)
    src_table = _upload_ptr_table(
        np.array([h.ptr for h in src_host], dtype=np.uint64))
    dev_bufs = [cupy.empty(size, dtype=np.uint8) for _ in range(number)]
    dev_table = _upload_ptr_table(
        np.array([buf.data.ptr for buf in dev_bufs], dtype=np.uint64))
    dst_host = make_batch_host_memory(size, number, dtype)
    dst_table = _upload_ptr_table(
        np.array([h.ptr for h in dst_host], dtype=np.uint64))
    start = time.perf_counter()
    stream.HostToDeviceBatch(src_table.data.ptr, dev_table.data.ptr, size, number)
    stream.DeviceToHostBatch(dev_table.data.ptr, dst_table.data.ptr, size, number)
    cost = time.perf_counter() - start
    print(f"cost: {cost}s")
    print(f"bandwidth: {size * number / cost / 1e9}GB/s")
    # Every destination buffer must match its source buffer exactly.
    for src, dst in zip(src_host, dst_host):
        assert compare(src, dst, size, dtype, False)
197+
198+
@test_wrap
def trans_batch_with_ce_async(d, size, number, dtype):
    """Round-trip ``number`` buffers of ``size`` bytes host->device->host
    using asynchronous copy-engine batch transfers, then verify each pair.

    Both async transfers are enqueued back-to-back and the stream is
    synchronized once before timing stops.
    """
    stream = d.MakeStream()
    src_host = make_batch_host_memory(size, number, dtype, True)
    src_ptrs = np.array([h.ptr for h in src_host], dtype=np.uint64)
    dev_bufs = [cupy.empty(size, dtype=np.uint8) for _ in range(number)]
    dev_ptrs = np.array([buf.data.ptr for buf in dev_bufs], dtype=np.uint64)
    dst_host = make_batch_host_memory(size, number, dtype)
    dst_ptrs = np.array([h.ptr for h in dst_host], dtype=np.uint64)
    start = time.perf_counter()
    stream.HostToDeviceBatchAsync(src_ptrs, dev_ptrs, size, number)
    stream.DeviceToHostBatchAsync(dev_ptrs, dst_ptrs, size, number)
    stream.Synchronized()
    cost = time.perf_counter() - start
    print(f"cost: {cost}s")
    print(f"bandwidth: {size * number / cost / 1e9}GB/s")
    # Every destination buffer must match its source buffer exactly.
    for src, dst in zip(src_host, dst_host):
        assert compare(src, dst, size, dtype, False)
217+
218+
@test_wrap
def trans_batch_with_sm_async(dev, size, number, dtype):
    """Round-trip ``number`` buffers of ``size`` bytes host->device->host
    using asynchronous SM-based batch transfers, then verify each pair.

    The uint64 pointer tables are uploaded into cupy device arrays and
    the async batch APIs receive the device addresses of those tables;
    the stream is synchronized once before timing stops.
    """
    stream = dev.MakeSMStream()

    def _upload_ptr_table(ptrs):
        # Copy a host-side uint64 pointer table into a device array.
        table = cupy.empty(number, dtype=np.uint64)
        table.set(ptrs)
        return table

    src_host = make_batch_host_memory(size, number, dtype, True)
    src_table = _upload_ptr_table(
        np.array([h.ptr for h in src_host], dtype=np.uint64))
    dev_bufs = [cupy.empty(size, dtype=np.uint8) for _ in range(number)]
    dev_table = _upload_ptr_table(
        np.array([buf.data.ptr for buf in dev_bufs], dtype=np.uint64))
    dst_host = make_batch_host_memory(size, number, dtype)
    dst_table = _upload_ptr_table(
        np.array([h.ptr for h in dst_host], dtype=np.uint64))
    start = time.perf_counter()
    stream.HostToDeviceBatchAsync(src_table.data.ptr, dev_table.data.ptr, size, number)
    stream.DeviceToHostBatchAsync(dev_table.data.ptr, dst_table.data.ptr, size, number)
    stream.Synchronized()
    cost = time.perf_counter() - start
    print(f"cost: {cost}s")
    print(f"bandwidth: {size * number / cost / 1e9}GB/s")
    # Every destination buffer must match its source buffer exactly.
    for src, dst in zip(src_host, dst_host):
        assert compare(src, dst, size, dtype, False)
131243
132244
133245def main ():
@@ -143,6 +255,10 @@ def main():
143255 trans_with_sm (d , size , number , dtype )
144256 trans_with_ce_async (d , size , number , dtype )
145257 trans_with_sm_async (d , size , number , dtype )
258+ trans_batch_with_ce (d , size , number , dtype )
259+ trans_batch_with_sm (d , size , number , dtype )
260+ trans_batch_with_ce_async (d , size , number , dtype )
261+ trans_batch_with_sm_async (d , size , number , dtype )
146262
147263
148264if __name__ == "__main__" :
0 commit comments