Skip to content

Commit ae387d3

Browse files
committed
adding wip benchmark example
1 parent 3e91b0f commit ae387d3

File tree

1 file changed

+162
-6
lines changed

1 file changed

+162
-6
lines changed

src/offload/usage.md

Lines changed: 162 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ This feature is work-in-progress, and not ready for usage. The instructions here
44
We are currently working on launching the following Rust kernel on the GPU. To follow along, copy it to a `src/lib.rs` file.
55

66
```rust
7+
#![feature(link_llvm_intrinsics)]
78
#![feature(abi_gpu_kernel)]
89
#![feature(rustc_attrs)]
910
#![feature(core_intrinsics)]
@@ -39,9 +40,52 @@ fn main() {
3940
};
4041
libc::printf(val, (*array_c)[0]);
4142
}
43+
let mut div = [1.0, 2.0, 3.0, 4.0];
44+
let x1 = [1.0, 2.0, 3.0, 4.0];
45+
let x2 = [1.0, 2.0, 3.0, 4.0];
46+
let x3 = [1.0, 2.0, 3.0, 4.0];
47+
let x4 = [1.0, 2.0, 3.0, 4.0];
48+
let y1 = [1.0, 2.0, 3.0, 4.0];
49+
let y2 = [1.0, 2.0, 3.0, 4.0];
50+
let y3 = [1.0, 2.0, 3.0, 4.0];
51+
let y4 = [1.0, 2.0, 3.0, 4.0];
52+
let fx1 = [1.0, 2.0, 3.0, 4.0];
53+
let fx2 = [1.0, 2.0, 3.0, 4.0];
54+
let fx3 = [1.0, 2.0, 3.0, 4.0];
55+
let fx4 = [1.0, 2.0, 3.0, 4.0];
56+
let fy1 = [1.0, 2.0, 3.0, 4.0];
57+
let fy2 = [1.0, 2.0, 3.0, 4.0];
58+
let fy3 = [1.0, 2.0, 3.0, 4.0];
59+
let fy4 = [1.0, 2.0, 3.0, 4.0];
60+
let real_zones = [0, 0, 0, 0];
61+
let half = 1.0;
62+
let ptiny = 0.001;
63+
let iend = 4;
4264

4365
unsafe {
44-
kernel(array_c);
66+
kernel(
67+
&mut div,
68+
&x1,
69+
&x2,
70+
&x3,
71+
&x4,
72+
&y1,
73+
&y2,
74+
&y3,
75+
&y4,
76+
&fx1,
77+
&fx2,
78+
&fx3,
79+
&fx4,
80+
&fy1,
81+
&fy2,
82+
&fy3,
83+
&fy4,
84+
&real_zones,
85+
&half,
86+
&ptiny,
87+
&iend,
88+
);
4589
}
4690
core::hint::black_box(&array_c);
4791
unsafe {
@@ -56,21 +100,133 @@ fn main() {
56100
}
57101

58102
// Host-side launcher for the example: it packs its argument(s) into a single tuple and hands
// that tuple, together with the `kernel_1` symbol, to `core::intrinsics::offload`.
// It performs no computation of its own.
#[inline(never)]
59-
unsafe fn kernel(x: *mut [f64; 256]) {
60-
core::intrinsics::offload(kernel_1, (x,))
103+
unsafe fn kernel(
104+
div: &mut [f32; 4],
105+
x1: &[f32; 4],
106+
x2: &[f32; 4],
107+
x3: &[f32; 4],
108+
x4: &[f32; 4],
109+
y1: &[f32; 4],
110+
y2: &[f32; 4],
111+
y3: &[f32; 4],
112+
y4: &[f32; 4],
113+
fx1: &[f32; 4],
114+
fx2: &[f32; 4],
115+
fx3: &[f32; 4],
116+
fx4: &[f32; 4],
117+
fy1: &[f32; 4],
118+
fy2: &[f32; 4],
119+
fy3: &[f32; 4],
120+
fy4: &[f32; 4],
121+
real_zones: &[usize; 4],
122+
half: &f32,
123+
ptiny: &f32,
124+
iend: &usize,
125+
) {
126+
core::intrinsics::offload(
127+
kernel_1,
128+
(
// The tuple lists the arguments in the same order as this function's parameter list.
129+
div, x1, x2, x3, x4, y1, y2, y3, y4, fx1, fx2, fx3, fx4, fy1, fy2, fy3, fy4,
130+
real_zones, half, ptiny, iend,
131+
),
132+
)
61133
}
62134

63135
// Declaration of the `kernel_1` symbol, only present when compiling for Linux. This is the
// symbol that `kernel` passes to `core::intrinsics::offload`.
// NOTE(review): the array parameters are declared here as `&[f32; 4]` / `&[usize; 4]`, while the
// `extern "gpu-kernel"` definition of `kernel_1` takes slices (`&[f32]` / `&[usize]`) — confirm
// that the two signatures are meant to differ.
#[cfg(target_os = "linux")]
64136
unsafe extern "C" {
65-
pub fn kernel_1(array_b: *mut [f64; 256]);
137+
pub fn kernel_1(
138+
div: &mut [f32; 4],
139+
x1: &[f32; 4],
140+
x2: &[f32; 4],
141+
x3: &[f32; 4],
142+
x4: &[f32; 4],
143+
y1: &[f32; 4],
144+
y2: &[f32; 4],
145+
y3: &[f32; 4],
146+
y4: &[f32; 4],
147+
fx1: &[f32; 4],
148+
fx2: &[f32; 4],
149+
fx3: &[f32; 4],
150+
fx4: &[f32; 4],
151+
fy1: &[f32; 4],
152+
fy2: &[f32; 4],
153+
fy3: &[f32; 4],
154+
fy4: &[f32; 4],
155+
real_zones: &[usize; 4],
156+
half: &f32,
157+
ptiny: &f32,
158+
iend: &usize,
159+
);
160+
}
161+
162+
// Index-query functions used by the kernel body, each bound through `link_name` to an
// `llvm.amdgcn.*` symbol. The `syncthreads` and `block_dim_x` declarations are commented out
// and therefore unavailable to callers.
// NOTE(review): `grid_dim_x` is bound to `llvm.amdgcn.workgroup.size.x`; the Rust name says
// "grid dimension" while the symbol says "workgroup size" — confirm that this symbol exists
// and is the intended one.
#[allow(improper_ctypes)]
163+
unsafe extern "C" {
164+
//#[link_name = "llvm.nvvm.barrier0"]
165+
//fn syncthreads() -> ();
166+
//#[link_name = "llvm.amdgcn.workitem.size.x"]
167+
//fn block_dim_x() -> i32;
168+
#[link_name = "llvm.amdgcn.workgroup.id.x"]
169+
fn block_idx_x() -> i32;
170+
#[link_name = "llvm.amdgcn.workgroup.size.x"]
171+
fn grid_dim_x() -> i32;
172+
#[link_name = "llvm.amdgcn.workitem.id.x"]
173+
fn thread_idx_x() -> i32;
66174
}
67175

68176
// Definition of `kernel_1` with the `gpu-kernel` ABI, compiled when the target OS is not Linux.
// For an index `ii` below `*iend`, it reads `i = real_zones[ii]`, combines the `x*`, `y*`,
// `fx*` and `fy*` values at position `i` into `dfxdx`, `dfydy` and `affine`, and stores their
// sum in `div[i]`. Indices at or above `*iend` do nothing.
#[cfg(not(target_os = "linux"))]
69177
#[unsafe(no_mangle)]
70178
#[inline(never)]
71179
#[rustc_offload_kernel]
72-
pub extern "gpu-kernel" fn kernel_1(x: *mut [f64; 256]) {
73-
unsafe { (*x)[0] = 21.0 };
180+
pub extern "gpu-kernel" fn kernel_1(
181+
div: &mut [f32],
182+
x1: &[f32],
183+
x2: &[f32],
184+
x3: &[f32],
185+
x4: &[f32],
186+
y1: &[f32],
187+
y2: &[f32],
188+
y3: &[f32],
189+
y4: &[f32],
190+
fx1: &[f32],
191+
fx2: &[f32],
192+
fx3: &[f32],
193+
fx4: &[f32],
194+
fy1: &[f32],
195+
fy2: &[f32],
196+
fy3: &[f32],
197+
fy4: &[f32],
198+
real_zones: &[usize],
199+
half: &f32,
200+
ptiny: &f32,
201+
iend: &usize,
202+
) {
// NOTE(review): `ii` is the plain sum of the block index and the thread index. The
// commented-out variant below would scale the block index by `block_dim_x()`, which is
// itself commented out — confirm which form is intended.
203+
let ii = unsafe { block_idx_x() + thread_idx_x() } as usize;
204+
//let ii = unsafe { block_dim_x() * block_idx_x() + thread_idx_x() } as usize;
205+
if ii < *iend {
// All array accesses below use `i` taken from `real_zones`, not `ii` itself.
206+
let i = real_zones[ii];
207+
208+
let xi: f32 = half * (x1[i] + x2[i] - x3[i] - x4[i]);
209+
let xj = half * (x2[i] + x3[i] - x4[i] - x1[i]);
210+
211+
let yi = half * (y1[i] + y2[i] - y3[i] - y4[i]);
212+
let yj = half * (y2[i] + y3[i] - y4[i] - y1[i]);
213+
214+
let fxi = half * (fx1[i] + fx2[i] - fx3[i] - fx4[i]);
215+
let fxj = half * (fx2[i] + fx3[i] - fx4[i] - fx1[i]);
216+
217+
let fyi = half * (fy1[i] + fy2[i] - fy3[i] - fy4[i]);
218+
let fyj = half * (fy2[i] + fy3[i] - fy4[i] - fy1[i]);
219+
// `ptiny` is added to the denominator before the reciprocal is taken; presumably this
// keeps the denominator away from zero — confirm.
220+
let rarea = 1.0 / (xi * yj - xj * yi + ptiny);
221+
222+
let dfxdx = rarea * (fxi * yj - fxj * yi);
223+
224+
let dfydy = rarea * (fyj * xi - fyi * xj);
225+
// NOTE(review): unlike `rarea`, this division has no `ptiny` term in its denominator.
226+
let affine = (fy1[i] + fy2[i] + fy3[i] + fy4[i]) / (y1[i] + y2[i] + y3[i] + y4[i]);
227+
228+
div[i] = dfxdx + dfydy + affine;
229+
}
74230
}
75231
```
76232

0 commit comments

Comments
 (0)