CUDA atomic function usage with volatile shared memory -
i have cuda kernel needs use atomic function on volatile shared integer memory. however, when try declare shared memory volatile , use in atomic function, error message.
below minimalist code reproduces error. please note following kernel nothing , horribly abuses why ever want declare shared memory volatile (or use shared memory @ all). does reproduce error.
the code uses atomic functions on shared memory, so, run it, need compile "arch12" or higher (in visual studio 2010, right click on project , go "properties -> configuration properties -> cuda c/c++ -> device" , enter "compute_12,sm_12" in "code generation" line). code should otherwise compile is.
#include <cstdlib> #include <cuda_runtime.h> static int const x_thrds_per_blk = 32; static int const y_thrds_per_blk = 8; __global__ void kernelwithsharedmemoryandatomicfunction(int * d_array, int numtotx, int numtoty) { __shared__ int s_blk[y_thrds_per_blk][x_thrds_per_blk]; // compiles //volatile __shared__ int s_blk[y_thrds_per_blk][x_thrds_per_blk]; // not compile int tx = threadidx.x; int ty = threadidx.y; int mx = blockidx.x*blockdim.x + threadidx.x; int = blockidx.y*blockdim.y + threadidx.y; int mi = my*numtotx + mx; if (mx < numtotx && < numtoty) { s_blk[ty][tx] = d_array[mi]; __syncthreads(); atomicmin(&s_blk[ty][tx], 4); // compile volatile shared memory if line commented out __syncthreads(); d_array[mi] = s_blk[ty][tx]; } } int main(void) { // declare , initialize array on host int const num_tot_x = 4*x_thrds_per_blk; int const num_tot_y = 6*y_thrds_per_blk; int * h_array = (int *)malloc(num_tot_x*num_tot_y*sizeof(int)); (int = 0; < num_tot_x*num_tot_y; ++i) h_array[i] = i; // copy array device int * d_array; cudamalloc((void **)&d_array, num_tot_x*num_tot_y*sizeof(int)); cudamemcpy(d_array, h_array, num_tot_x*num_tot_y*sizeof(int), cudamemcpyhosttodevice); // declare block , thread variables dim3 thdsperblk; dim3 blks; thdsperblk.x = x_thrds_per_blk; thdsperblk.y = y_thrds_per_blk; thdsperblk.z = 1; blks.x = (num_tot_x + x_thrds_per_blk - 1)/x_thrds_per_blk; blks.y = (num_tot_y + y_thrds_per_blk - 1)/y_thrds_per_blk; blks.z = 1; // run kernel kernelwithsharedmemoryandatomicfunction<<<blks, thdsperblk>>>(d_array, num_tot_x, num_tot_y); // cleanup free (h_array); cudafree(d_array); return 0; }
anyway, if comment out "s_blk" declaration towards top of kernel , uncomment commented-out declaration following it, should following error:
error : no instance of overloaded function "atomicmin" matches argument list
i not understand why declaring shared memory volatile affect type, (i think) error message indicating, nor why cannot used atomic operations.
can please provide insight?
thanks,
aaron
just replace
atomicmin(&s_blk[ty][tx], 4);
atomicmin((int *)&s_blk[ty][tx], 4);
.
it typecasts &s_blk[ty][tx]
matches argument of atomicmin(..)
.
Comments
Post a Comment