matrix - Why do I get "Unspecified Launch failure" in CUDA program, multiplying 2 matrices -
I am new to CUDA. When I multiply 1024x1024 matrices, I launch the kernel with:
multiplykernel << <dim3(32,32, 1), dim3(32, 32, 1) >> >(dev_c, dev_a, dev_b, size);
But when I multiply 2048 x 2048 matrices, with dim3(64,64,1), I get this error:
cudadevicesynchronize returned error code 4 after launching addkernel! unspecified launch failure
From tinkering with the code, I think the error is in this statement:
result += a[row * size + ind] * b[col + size * ind];
in part
b[col+size*ind]
If I take it out, I don't get the kernel launch error (just a wrong answer, obviously). I cannot figure out what's wrong. Any suggestions would be appreciated. I am using Visual Studio 2013. Using the debugger did not help me find the error.
This seems to be a similar problem: cudaDeviceSynchronize returned error code 4 after launching
Many thanks, here is the code:
#include <cstdio>
#include <cstdlib>

#include <cuda_runtime.h>

cudaError_t multiplywithcuda(int *c, const int *a, const int *b, unsigned int size);

// Computes one element of c = a * b for square, row-major size x size int
// matrices.
//
// Launch layout: a 2-D grid of 2-D blocks in which x maps to the column and
// y maps to the row; one thread per output element. The grid may overshoot
// the matrix, surplus threads return without touching memory.
// Shared memory: none.
//
// NOTE(review): every thread runs a loop of `size` iterations, so kernel run
// time grows quickly with `size`. On Windows/WDDM a kernel that exceeds the
// display driver's TDR (Timeout Detection and Recovery) delay is killed and
// surfaces as "unspecified launch failure" at the next synchronizing call.
__global__ void multiplykernel(int *c, const int *a, const int *b, unsigned int size)
{
    const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Valid indices are 0 .. size-1, so the guard must reject row == size and
    // col == size as well (the previous `>` let those threads run out of bounds).
    if (row >= size || col >= size)
        return;

    // Flat offsets are computed in size_t so row * size cannot wrap in 32 bits.
    const size_t rowbase = (size_t)row * size;

    int result = 0;
    for (unsigned int ind = 0; ind < size; ++ind) {
        result += a[rowbase + ind] * b[col + (size_t)size * ind];
    }

    // target field in 1-d
    c[rowbase + col] = result;
}

// Fills two 2048 x 2048 matrices with random 0/1 values, multiplies them on
// the GPU and resets the device. Returns 0 on success, 1 on any CUDA failure.
int main()
{
    const int sizematrix = 2048;
    const size_t count = (size_t)sizematrix * sizematrix;

    int *a = new int[count];
    int *b = new int[count];
    int *c = new int[count];

    for (size_t i = 0; i < count; i++) {
        a[i] = rand() % 2;
        b[i] = rand() % 2;
    }

    int exitcode = 0;

    cudaError_t cudastatus = multiplywithcuda(c, a, b, sizematrix);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "addwithcuda failed!");
        exitcode = 1;
    } else {
        cudastatus = cudaDeviceReset();
        if (cudastatus != cudaSuccess) {
            fprintf(stderr, "cudadevicereset failed!");
            exitcode = 1;
        }
    }

    // Release the host matrices on every path (they were previously leaked).
    delete[] a;
    delete[] b;
    delete[] c;

    return exitcode;
}

// Host wrapper: copies a and b (size x size ints each) to the device, runs
// multiplykernel, and copies the product back into c.
//
// Parameters:
//   c    - host output buffer, size * size ints
//   a, b - host input buffers, size * size ints each
//   size - matrix dimension
// Returns the first CUDA error encountered, or cudaSuccess.
// Device buffers are freed on every exit path.
cudaError_t multiplywithcuda(int *c, const int *a, const int *b, unsigned int size)
{
    // Start as NULL so the cleanup at `error:` is safe even when we bail out
    // before an allocation happened (cudaFree(NULL) is a no-op).
    int *dev_a = NULL;
    int *dev_b = NULL;
    int *dev_c = NULL;
    cudaError_t cudastatus;

    // Everything with an initializer is declared before the first `goto`,
    // because C++ forbids jumping over an initialization.
    const size_t bytes = (size_t)size * size * sizeof(int);
    const unsigned int tile = 32;
    const dim3 block(tile, tile, 1);
    // Ceil-divide so the grid covers the whole matrix for any size, instead
    // of the hard-coded 64 x 64 that only fits size == 2048.
    const dim3 grid((size + tile - 1) / tile, (size + tile - 1) / tile, 1);

    // choose gpu run on, change on multi-gpu system.
    cudastatus = cudaSetDevice(0);
    fprintf(stdout, "device set");
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "cudasetdevice failed! have cuda-capable gpu installed?");
        goto error;
    }

    // allocate gpu buffers 3 vectors (two input, 1 output) .
    cudastatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "cudamalloc failed!");
        goto error;
    }
    fprintf(stdout, "buffer c allocated \n");

    cudastatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "cudamalloc failed!");
        goto error;
    }
    fprintf(stdout, "buffer allocated \n");

    cudastatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "cudamalloc failed!");
        goto error;
    }
    fprintf(stdout, "buffer b allocated \n");

    // copy input vectors host memory gpu buffers.
    cudastatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "cudamemcpy failed!");
        goto error;
    }
    fprintf(stdout, "cudamemcpy done \n");

    cudastatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "cudamemcpy failed!");
        goto error;
    }
    fprintf(stdout, "cudamemcpy b done\n");

    fprintf(stdout, "about launch kernel \n");
    // launch kernel on gpu 1 thread each element.
    multiplykernel<<<grid, block>>>(dev_c, dev_a, dev_b, size);
    fprintf(stdout, "kernel launched\n");

    // check errors launching kernel
    cudastatus = cudaGetLastError();
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "addkernel launch failed: %s\n", cudaGetErrorString(cudastatus));
        goto error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish and returns any
    // error encountered while it was running.
    cudastatus = cudaDeviceSynchronize();
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "cudadevicesynchronize returned error code %d after launching addkernel!\n", cudastatus);
        fprintf(stderr, " %s\n", cudaGetErrorString(cudastatus));
        goto error;
    }

    // copy output vector gpu buffer host memory.
    cudastatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "cudamemcpy failed!");
        goto error;
    }

error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudastatus;
}
On Windows, I right-clicked the Nsight Monitor icon in the system tray. There I chose Options > General. There we see the WDDM TDR delay. It was at 2, and I increased it to 10. Then I ran my program again, and it worked fine. This was according to Robert's link (see above): http://http.developer.nvidia.com/nsightvisualstudio/2.2/documentation/userguide/html/content/timeout_detection_recovery.htm
Comments
Post a Comment