@@ -4,6 +4,7 @@ This feature is work-in-progress, and not ready for usage. The instructions here
We are currently working on launching the following Rust kernel on the GPU. To follow along, copy it to a `src/lib.rs` file.
55
66``` rust
7+ #![feature(link_llvm_intrinsics)]
78#![feature(abi_gpu_kernel)]
89#![feature(rustc_attrs)]
910#![feature(core_intrinsics)]
@@ -39,9 +40,52 @@ fn main() {
3940 };
4041 libc :: printf (val , (* array_c )[0 ]);
4142 }
43+ let mut div = [1.0 , 2.0 , 3.0 , 4.0 ];
44+ let x1 = [1.0 , 2.0 , 3.0 , 4.0 ];
45+ let x2 = [1.0 , 2.0 , 3.0 , 4.0 ];
46+ let x3 = [1.0 , 2.0 , 3.0 , 4.0 ];
47+ let x4 = [1.0 , 2.0 , 3.0 , 4.0 ];
48+ let y1 = [1.0 , 2.0 , 3.0 , 4.0 ];
49+ let y2 = [1.0 , 2.0 , 3.0 , 4.0 ];
50+ let y3 = [1.0 , 2.0 , 3.0 , 4.0 ];
51+ let y4 = [1.0 , 2.0 , 3.0 , 4.0 ];
52+ let fx1 = [1.0 , 2.0 , 3.0 , 4.0 ];
53+ let fx2 = [1.0 , 2.0 , 3.0 , 4.0 ];
54+ let fx3 = [1.0 , 2.0 , 3.0 , 4.0 ];
55+ let fx4 = [1.0 , 2.0 , 3.0 , 4.0 ];
56+ let fy1 = [1.0 , 2.0 , 3.0 , 4.0 ];
57+ let fy2 = [1.0 , 2.0 , 3.0 , 4.0 ];
58+ let fy3 = [1.0 , 2.0 , 3.0 , 4.0 ];
59+ let fy4 = [1.0 , 2.0 , 3.0 , 4.0 ];
60+ let real_zones = [0 , 0 , 0 , 0 ];
61+ let half = 1.0 ;
62+ let ptiny = 0.001 ;
63+ let iend = 4 ;
4264
4365 unsafe {
44- kernel (array_c );
66+ kernel (
67+ & mut div ,
68+ & x1 ,
69+ & x2 ,
70+ & x3 ,
71+ & x4 ,
72+ & y1 ,
73+ & y2 ,
74+ & y3 ,
75+ & y4 ,
76+ & fx1 ,
77+ & fx2 ,
78+ & fx3 ,
79+ & fx4 ,
80+ & fy1 ,
81+ & fy2 ,
82+ & fy3 ,
83+ & fy4 ,
84+ & real_zones ,
85+ & half ,
86+ & ptiny ,
87+ & iend ,
88+ );
4589 }
4690 core :: hint :: black_box (& array_c );
4791 unsafe {
@@ -56,21 +100,133 @@ fn main() {
56100}
57101
// Host-side wrapper `kernel`: hands `kernel_1` plus a tuple of all arguments to
// `core::intrinsics::offload`.
// Lines marked `-` are the previous version (a single `*mut [f64; 256]` argument);
// lines marked `+` are the new version, which takes 21 reference arguments.
// NOTE(review): this is an `unsafe fn`, but the contract the caller must uphold is
// not stated anywhere in the visible text — document it under `# Safety`.
58102#[inline(never)]
59- unsafe fn kernel (x : * mut [f64 ; 256 ]) {
60- core :: intrinsics :: offload (kernel_1 , (x ,))
// New signature: one mutable output array (`div`), sixteen read-only `[f32; 4]`
// inputs, one `[usize; 4]` index array, and three by-reference scalars.
103+ unsafe fn kernel (
104+ div : & mut [f32 ; 4 ],
105+ x1 : & [f32 ; 4 ],
106+ x2 : & [f32 ; 4 ],
107+ x3 : & [f32 ; 4 ],
108+ x4 : & [f32 ; 4 ],
109+ y1 : & [f32 ; 4 ],
110+ y2 : & [f32 ; 4 ],
111+ y3 : & [f32 ; 4 ],
112+ y4 : & [f32 ; 4 ],
113+ fx1 : & [f32 ; 4 ],
114+ fx2 : & [f32 ; 4 ],
115+ fx3 : & [f32 ; 4 ],
116+ fx4 : & [f32 ; 4 ],
117+ fy1 : & [f32 ; 4 ],
118+ fy2 : & [f32 ; 4 ],
119+ fy3 : & [f32 ; 4 ],
120+ fy4 : & [f32 ; 4 ],
121+ real_zones : & [usize ; 4 ],
122+ half : & f32 ,
123+ ptiny : & f32 ,
124+ iend : & usize ,
125+ ) {
// The arguments are forwarded unchanged, in declaration order, as a single tuple.
126+ core :: intrinsics :: offload (
127+ kernel_1 ,
128+ (
129+ div , x1 , x2 , x3 , x4 , y1 , y2 , y3 , y4 , fx1 , fx2 , fx3 , fx4 , fy1 , fy2 , fy3 , fy4 ,
130+ real_zones , half , ptiny , iend ,
131+ ),
132+ )
61133}
62134
// Declaration of `kernel_1` used when compiling with `target_os = "linux"`; the
// body is supplied by the `extern "gpu-kernel"` definition that is compiled only
// when `target_os` is NOT linux.
// The line marked `-` is the previous one-argument declaration; the `+` lines
// replace it with the 21-argument form that mirrors the `kernel` wrapper.
// NOTE(review): the array parameters here are references to fixed-size arrays
// (`&[f32; 4]`, `&[usize; 4]`), while the `extern "gpu-kernel"` definition takes
// slices (`&[f32]`, `&[usize]`). Confirm the two signatures are intended to be
// ABI-compatible.
63135#[cfg(target_os = " linux" )]
64136unsafe extern " C" {
65- pub fn kernel_1 (array_b : * mut [f64 ; 256 ]);
137+ pub fn kernel_1 (
138+ div : & mut [f32 ; 4 ],
139+ x1 : & [f32 ; 4 ],
140+ x2 : & [f32 ; 4 ],
141+ x3 : & [f32 ; 4 ],
142+ x4 : & [f32 ; 4 ],
143+ y1 : & [f32 ; 4 ],
144+ y2 : & [f32 ; 4 ],
145+ y3 : & [f32 ; 4 ],
146+ y4 : & [f32 ; 4 ],
147+ fx1 : & [f32 ; 4 ],
148+ fx2 : & [f32 ; 4 ],
149+ fx3 : & [f32 ; 4 ],
150+ fx4 : & [f32 ; 4 ],
151+ fy1 : & [f32 ; 4 ],
152+ fy2 : & [f32 ; 4 ],
153+ fy3 : & [f32 ; 4 ],
154+ fy4 : & [f32 ; 4 ],
155+ real_zones : & [usize ; 4 ],
156+ half : & f32 ,
157+ ptiny : & f32 ,
158+ iend : & usize ,
159+ );
160+ }
161+
// Bindings that map Rust function names onto LLVM intrinsics through `link_name`.
// All three active bindings use `llvm.amdgcn.*` names, take no arguments, and
// return `i32`.
// NOTE(review): the `#[allow(improper_ctypes)]` below has no stated reason — add
// one.
162+ #[allow(improper_ctypes)]
163+ unsafe extern " C" {
// Disabled bindings (a barrier and a work-item size query), kept commented out.
164+ // #[link_name = "llvm.nvvm.barrier0"]
165+ // fn syncthreads() -> ();
166+ // #[link_name = "llvm.amdgcn.workitem.size.x"]
167+ // fn block_dim_x() -> i32;
// Used by `kernel_1` as the first term of its element index.
168+ #[link_name = " llvm.amdgcn.workgroup.id.x" ]
169+ fn block_idx_x () -> i32 ;
// NOTE(review): `grid_dim_x` is not referenced anywhere in the visible code, and
// it is bound to a `workgroup.size` name despite being called "grid dim" —
// verify that this intrinsic exists in the LLVM version in use and that the
// name/meaning pairing is intended.
170+ #[link_name = " llvm.amdgcn.workgroup.size.x" ]
171+ fn grid_dim_x () -> i32 ;
// Used by `kernel_1` as the second term of its element index.
172+ #[link_name = " llvm.amdgcn.workitem.id.x" ]
173+ fn thread_idx_x () -> i32 ;
66174}
67175
// Device-side definition of `kernel_1`, compiled only when `target_os` is not
// linux and exported unmangled with the `gpu-kernel` ABI.
// Lines marked `-` are the previous body (it stored 21.0 into element 0 of a
// `*mut [f64; 256]`); lines marked `+` are the new body.
// New behaviour: each invocation derives one index `ii`, and if `ii < *iend` it
// looks up `i = real_zones[ii]` and writes a single value to `div[i]` computed
// from element `i` of the sixteen input slices.
// NOTE(review): the array parameters are slices here but `&[T; 4]` in the
// `extern "C"` declaration and in the `kernel` wrapper — confirm this is intended.
68176#[cfg(not(target_os = " linux" ))]
69177#[unsafe (no_mangle)]
70178#[inline(never)]
71179#[rustc_offload_kernel]
72- pub extern " gpu-kernel" fn kernel_1 (x : * mut [f64 ; 256 ]) {
73- unsafe { (* x )[0 ] = 21.0 };
180+ pub extern " gpu-kernel" fn kernel_1 (
181+ div : & mut [f32 ],
182+ x1 : & [f32 ],
183+ x2 : & [f32 ],
184+ x3 : & [f32 ],
185+ x4 : & [f32 ],
186+ y1 : & [f32 ],
187+ y2 : & [f32 ],
188+ y3 : & [f32 ],
189+ y4 : & [f32 ],
190+ fx1 : & [f32 ],
191+ fx2 : & [f32 ],
192+ fx3 : & [f32 ],
193+ fx4 : & [f32 ],
194+ fy1 : & [f32 ],
195+ fy2 : & [f32 ],
196+ fy3 : & [f32 ],
197+ fy4 : & [f32 ],
198+ real_zones : & [usize ],
199+ half : & f32 ,
200+ ptiny : & f32 ,
201+ iend : & usize ,
202+ ) {
// Element index = workgroup id + work-item id, cast from i32 to usize.
// NOTE(review): without the block-size multiplier shown in the commented-out
// line below, different (workgroup, work-item) pairs can yield the same `ii`
// (e.g. (0, 1) and (1, 0)) — confirm the intended launch configuration.
203+ let ii = unsafe { block_idx_x () + thread_idx_x () } as usize ;
204+ // let ii = unsafe { block_dim_x() * block_idx_x() + thread_idx_x() } as usize;
// Invocations whose index is at or beyond `*iend` do nothing.
205+ if ii < * iend {
// `real_zones` maps the invocation index to the element index used below.
206+ let i = real_zones [ii ];
207+
// Each pair below is `half` times a signed combination of the four inputs:
// (a1 + a2 - a3 - a4) and (a2 + a3 - a4 - a1), for a in {x, y, fx, fy}.
208+ let xi : f32 = half * (x1 [i ] + x2 [i ] - x3 [i ] - x4 [i ]);
209+ let xj = half * (x2 [i ] + x3 [i ] - x4 [i ] - x1 [i ]);
210+
211+ let yi = half * (y1 [i ] + y2 [i ] - y3 [i ] - y4 [i ]);
212+ let yj = half * (y2 [i ] + y3 [i ] - y4 [i ] - y1 [i ]);
213+
214+ let fxi = half * (fx1 [i ] + fx2 [i ] - fx3 [i ] - fx4 [i ]);
215+ let fxj = half * (fx2 [i ] + fx3 [i ] - fx4 [i ] - fx1 [i ]);
216+
217+ let fyi = half * (fy1 [i ] + fy2 [i ] - fy3 [i ] - fy4 [i ]);
218+ let fyj = half * (fy2 [i ] + fy3 [i ] - fy4 [i ] - fy1 [i ]);
219+
// Reciprocal of (xi*yj - xj*yi) with `ptiny` added to the denominator —
// presumably to keep the division away from zero; confirm.
220+ let rarea = 1.0 / (xi * yj - xj * yi + ptiny );
221+
222+ let dfxdx = rarea * (fxi * yj - fxj * yi );
223+
224+ let dfydy = rarea * (fyj * xi - fyi * xj );
225+
// Sum of the four fy values divided by the sum of the four y values.
226+ let affine = (fy1 [i ] + fy2 [i ] + fy3 [i ] + fy4 [i ]) / (y1 [i ] + y2 [i ] + y3 [i ] + y4 [i ]);
227+
// The only write performed by the new body.
228+ div [i ] = dfxdx + dfydy + affine ;
229+ }
74230}
75231```
76232
0 commit comments