// NOTE(review): the three header names were lost in extraction (only the bare
// `#include` tokens survived). The headers below cover every name the visible
// code uses (cerr/endl, exit/atoi, CUDA runtime API) - confirm against the
// upstream file.
#include <cstdlib>
#include <iostream>
#include <cuda_runtime.h>

using namespace std;

// Evaluates a CUDA runtime call exactly once. If the call does not return
// cudaSuccess, prints the error string together with the line and file of the
// call site to cerr and terminates the process with exit status 1.
// Wrapped in do { } while (0) so that `CUDA_CHECK(x);` behaves as a single
// statement (safe inside an unbraced if/else); the argument is parenthesised
// so that any expression can be passed.
#define CUDA_CHECK(value) do { \
    cudaError_t cu_err = (value); \
    if (cu_err != cudaSuccess) { \
        cerr << "Error " << cudaGetErrorString(cu_err) << " at line "; \
        cerr << __LINE__ << " in file " << __FILE__<< endl; \
        exit(1); \
    } \
} while (0)

// Sampled region of the complex plane ([xmin, xmax] x [ymin, ymax]) and the
// number of sample points along each axis.
double xmin = -2;
double xmax = 1;
int xstep = 256;
double ymin = -1;
double ymax = 1;
int ystep = 256;

// Iteration budget handed to escapes() by both kernels.
const int iters = 100000;

// -----------------------------------------------------
// function executed on device
// -----------------------------------------------------
// Iterates z <- z*z + c, starting from z = 0, for the point c = cr + i*ci.
//
//   cr, ci : real and imaginary part of c
//   it     : maximum number of iterations
//
// Returns 1 as soon as |z|^2 exceeds 4, and 0 if that never happens within
// `it` iterations.
//
// NOTE(review): in the extracted source everything between the `<` of the
// loop condition and the `>` of the escape test was lost. The loop header and
// update statements below are reconstructed from the surviving declarations
// (zr, zi, zrtmp, i, it) and the surviving `> 4` test - confirm against the
// upstream file.
__device__ int escapes(double cr, double ci, int it)
{
    double zr = 0;
    double zi = 0;
    double zrtmp;
    int i;

    for (i = 0; i < it; i++) {
        // (zr + i*zi)^2 + (cr + i*ci), split into real and imaginary parts;
        // zrtmp keeps the old zr alive until zi has been updated.
        zrtmp = zr*zr - zi*zi + cr;
        zi = 2*zr*zi + ci;
        zr = zrtmp;
        if (zr*zr + zi*zi > 4) {
            return 1;
        }
    }
    return 0;
}

// -----------------------------------------------------
// non optimized kernel for grid / block
// -----------------------------------------------------
// Computes one cell of the output per block: the cell coordinates are taken
// from blockIdx only, threadIdx is not used.
//
//   m            : device buffer with room for at least xstep*ystep chars,
//                  indexed as m[row*xstep + column]
//   ymin, ymax   : sampled range on the imaginary axis
//   xmin, xmax   : sampled range on the real axis
//   ystep, xstep : number of rows / columns
//
// Writes ' ' when escapes() reports 1 for the cell's point and 'X' otherwise.
// Blocks that fall outside the xstep x ystep image do nothing, so a grid
// larger than the image cannot write past the end of m.
__global__ void kernel_1(char *m, const double ymin, const double ymax,
                         const double xmin, const double xmax,
                         const int ystep, const int xstep)
{
    int xc = blockIdx.x;
    int yc = blockIdx.y;

    if (xc >= xstep || yc >= ystep) {
        return;
    }

    double y = yc*(ymax-ymin) / ystep + ymin;
    double x = xc*(xmax-xmin) / xstep + xmin;

    if (escapes(x, y, iters)) {
        m[yc*xstep+xc] = ' ';
    } else {
        m[yc*xstep+xc] = 'X';
    }
}

// -----------------------------------------------------
// optimized kernel for grid / block
// -----------------------------------------------------
// Computes one cell of the output per thread of a 2D grid of 2D blocks.
//
//   m            : device buffer with room for at least xstep*ystep chars,
//                  indexed as m[row*xstep + column]
//   ymin, ymax   : sampled range on the imaginary axis
//   xmin, xmax   : sampled range on the real axis
//   ystep, xstep : number of rows / columns
//
// Writes ' ' when escapes() reports 1 for the cell's point and 'X' otherwise.
// The row pitch is xstep (the same layout kernel_1 uses) instead of
// gridDim.x * blockDim.x; both agree when the launch covers the image
// exactly, but only xstep is correct when the grid is rounded up. Threads
// outside the xstep x ystep image return without writing.
__global__ void kernel_2(char *m, const double ymin, const double ymax,
                         const double xmin, const double xmax,
                         const int ystep, const int xstep)
{
    int xc = blockIdx.x * blockDim.x + threadIdx.x;
    int yc = blockIdx.y * blockDim.y + threadIdx.y;

    if (xc >= xstep || yc >= ystep) {
        return;
    }

    int offset = yc * xstep + xc;

    double y = yc*(ymax-ymin) / ystep + ymin;
    double x = xc*(xmax-xmin) / xstep + xmin;

    if (escapes(x, y, iters)) {
        m[offset] = ' ';
    } else {
        m[offset] = 'X';
    }
}

// Selects which kernel main launches: 1 -> kernel_1, anything else -> kernel_2.
int kernel = 1;

// -----------------------------------------------------
// main function
// -----------------------------------------------------
int main(int argc, char *argv[]) { size_t
size = 256 * 256; size_t size_in_bytes = size * sizeof(char); // allocate matrix on CPU as AD-array char *h_m = new char [size]; if (argc > 1) { kernel = atoi(argv[1]); } // allocate matrix on GPU as 1D-array char *d_m; CUDA_CHECK(cudaMalloc(&d_m, size_in_bytes)); if (kernel == 1) { dim3 grid(xstep, ystep); dim3 block(1); kernel_1<<>>(d_m, ymin, ymax, xmin, xmax, ystep, xstep); } else { dim3 grid(xstep/16, ystep/16); dim3 block(16,16); kernel_2<<>>(d_m, ymin, ymax, xmin, xmax, ystep, xstep); } cudaError_t _m_cudaStat = cudaGetLastError(); if (_m_cudaStat != cudaSuccess) { cerr << "error " << cudaGetLastError() << endl; } // copy result from GPU to CPU CUDA_CHECK(cudaMemcpy(h_m, d_m, size_in_bytes, cudaMemcpyDeviceToHost)); for (int i=0; i