Parallelizing a nested for loop with CUDA hits a hard limit
I am new to CUDA. I'm trying to write a CUDA kernel that performs the following piece of code.
// Accumulate complex products into ic_real/ic_img.
// For every (oz, ox, xhn, yhn), the complex sample s = sr + i*si is multiplied
// by the complex table entry h = hr_table + i*hi_table and summed into the
// output at src2 = xhn*wjv + yhn. Note src2 does not depend on oz or ox, so
// every (oz, ox) iteration accumulates into the same wjh*wjv output plane.
for (int oz = 0; oz < count1; oz++) {
    for (int ox = 0; ox < scale + 1; ox++) {
        for (int xhn = 0; xhn < wjh; xhn++) {
            // Table index depends only on (oz, ox, xhn) — hoisted out of the
            // innermost loop (the original recomputed it and re-read the
            // tables on every yhn iteration).
            const int numx = oz * (scale + 1) * wjh + ox * wjh + xhn;
            const float hre = hr_table[numx];
            const float him = hi_table[numx];
            for (int yhn = 0; yhn < wjv; yhn++) {
                const int src  = oz * (scale + 1) * wjv + ox * wjv + yhn;
                const int src2 = yhn + xhn * wjv;
                // Complex multiply-accumulate: ic += s * h.
                ic_real[src2] += sr[src] * hre - si[src] * him;
                ic_img[src2]  += sr[src] * him + si[src] * hre;
            }
        }
    }
}
The values are wjh=1080, wjv=1920, scale=255; count1 can be >= 4. This is what I have currently, but the code only works when count1 <= 4; if count1 > 4, it doesn't work. Does anyone know what I should do? Cheers.
// Per-pixel complex multiply-accumulate with an in-block tree reduction.
// Expected launch layout (see host code): grid = (wjv, wjh), block = (count1,
// scale+1), so each block produces one output element ic_re/ic_im[blockId].
//
// NOTE(review): blockDim.x * blockDim.y must not exceed the hardware limit of
// 1024 threads per block; with blockDim.y == 256 this caps count1 at 4, which
// is why the launch fails for count1 > 4 — fixing that requires restructuring
// the launch, not just this kernel.
// NOTE(review): `threadperblock` and `scale` are compile-time constants
// defined elsewhere; the tree reduction below assumes threadperblock is a
// power of two and equals blockDim.x * blockDim.y — confirm.
__global__ void lut_kernel(float *sr, float *si, dim3 size, int wjh, int wjv,
                           float *vr, float *vi, float *hr, float *hi,
                           float *ic_re, float *ic_im)
{
    __shared__ float cacheRe[threadperblock];
    __shared__ float cacheIm[threadperblock];

    int blockId    = blockIdx.x + blockIdx.y * gridDim.x;
    int cacheIndex = threadIdx.y * blockDim.x + threadIdx.x;
    int z = threadIdx.x;  // oz index (0 .. count1-1)
    int x = threadIdx.y;  // ox index (0 .. scale)

    float re = 0.0f;
    float im = 0.0f;

    // Bounds come from the launch configuration instead of the original
    // hard-coded "z < 4 && x < 256", which silently dropped contributions
    // whenever count1 > 4.
    if (z < blockDim.x && x < blockDim.y) {
        int src2 = z * (scale + 1) * wjh + x * wjh + blockIdx.y;
        int src  = z * (scale + 1) * wjv + x * wjv + blockIdx.x;
        re = sr[src] * hr[src2] - si[src] * hi[src2];
        im = sr[src] * hi[src2] + si[src] * hr[src2];
    }
    cacheRe[cacheIndex] = re;
    cacheIm[cacheIndex] = im;
    __syncthreads();

    // Tree reduction over the block. The original loop had NO barrier inside
    // it, which is a data race across warps; the barrier is placed outside
    // the divergent if so all threads reach it.
    for (int stride = threadperblock / 2; stride != 0; stride /= 2) {
        if (cacheIndex < stride) {
            cacheRe[cacheIndex] += cacheRe[cacheIndex + stride];
            cacheIm[cacheIndex] += cacheIm[cacheIndex + stride];
        }
        __syncthreads();
    }

    if (cacheIndex == 0) {
        ic_re[blockId] = cacheRe[0];
        ic_im[blockId] = cacheIm[0];
    }
}
// NOTE(review): dimblock = (count1, 256) requests count1*256 threads per
// block. CUDA caps a block at 1024 threads, so any count1 > 4 makes this
// launch fail with an invalid-configuration error and the kernel never runs
// — this is why the code "doesn't work" for count1 > 4. Check the result of
// cudaGetLastError() after the launch. To support count1 > 4, move count1
// out of the block dimensions (e.g. loop over oz inside the kernel, or put
// it on a grid dimension and combine partial sums with atomicAdd).
the kernel parameter is: dim3 dimblock(count1,256); dim3 dimgrid(wjv,wjh);
lut_kernel<<<dimgrid,dimblock>>>(d_sr,d_si,size,wjh,wjv,dvr_table,dvi_table,dhr_table,dhi_table,dic_re,dic_im);
If count1 > 4, how should I parallelize the nested code?
I checked the code briefly; it seems the computation of the ic_img and ic_real elements is easy to parallelize (the count1, scale+1, wjh, and wjv dimensions have no dependency among each other). Thus, there's no need to have shared variables and while loops in the kernel; it's easy to implement as below, with the parameter int numelements = count1 * (scale+1) * wjh * wjv.
// One thread per element of the flattened (count1, scale+1, wjh, wjv)
// iteration space; numelements = count1 * (scale+1) * wjh * wjv.
// The original snippet was missing the variable name ("int =") and had the
// CUDA builtins lowercased.
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numelements) {
    // decompose i into (oz, ox, xhn, yhn) and perform one multiply-accumulate
}
This code is easier to maintain and eliminates the kinds of bugs that long code like the example is prone to. If the src2 values do not repeat within the innermost loop, performance will be close to optimal as well. If src2 may repeat, use atomicAdd so the results are correct as expected; with atomicAdd, performance may not be optimal, but at least you have one correctly implemented, bug-free kernel. If it turns out to be a performance bottleneck, improve it by trying and experimenting with different implementations.
Comments
Post a Comment