Modified the scan kernel so that it can work with > 1024*1024 numbers
[RBC.git] / kernels.cu
#ifndef KERNELS_CU
#define KERNELS_CU

#include <cuda.h>
#include "defs.h"
#include "kernels.h"
#include <stdio.h>

// This kernel does the same thing as nnKernel, except that it only considers
// the pairs specified by the compPlan.
__global__ void planNNKernel(const matrix Q, const matrix X, real *dMins, int *dMinIDs, compPlan cP, intMatrix xmap, int *qIDs, int qStartPos ){
  int qBlock = qStartPos + blockIdx.y * BLOCK_SIZE; //indexes Q
  int xBlock; //indexes X
  int colBlock;
  int offQ = threadIdx.y; //the offset of qPos in this block
  int offX = threadIdx.x; //ditto for x
  int i,j,k,l;

  __shared__ real min[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ int minPos[BLOCK_SIZE][BLOCK_SIZE];

  __shared__ real Xb[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ real Qb[BLOCK_SIZE][BLOCK_SIZE];

  __shared__ int g; //group of q
  __shared__ int numGroups;
  __shared__ int groupCount;
  __shared__ int groupOff;

  if(offX==0 && offQ==0){
    g = cP.qToGroup[qBlock];
    numGroups = cP.numGroups[g];
  }
  min[offQ][offX]=MAX_REAL;
  __syncthreads();

  //NOTE: it might be better to keep numGroups and groupIts in local registers.
  for(i=0;i<numGroups;i++){ //iterate over groups of X
    if(offX==0 && offQ==0){
      groupCount = cP.groupCountX[IDX(g,i,cP.ld)];
      groupOff = cP.groupOff[IDX(g,i,cP.ld)];
    }
    __syncthreads();

    int groupIts = groupCount/BLOCK_SIZE + (groupCount%BLOCK_SIZE==0? 0 : 1);

    for(j=0;j<groupIts;j++){ //iterate over elements of group
      xBlock = groupOff + j*BLOCK_SIZE; //start of this block within xmap's row

      real ans=0;
      for(k=0;k<X.pc/BLOCK_SIZE;k++){ //iterate over cols to compute the distances
        colBlock = k*BLOCK_SIZE;

        //Each thread loads one element of X and Q into shared memory.
        //Note that the indexing is flipped to increase memory coalescing.
        Xb[offX][offQ] = X.mat[IDX( xmap.mat[IDX( g, xBlock+offQ, xmap.ld )], colBlock+offX, X.ld )];
        Qb[offX][offQ] = ( (qIDs[qBlock+offQ]==DUMMY_IDX) ? 0 : Q.mat[IDX( qIDs[qBlock+offQ], colBlock+offX, Q.ld )] );
        __syncthreads();

        for(l=0;l<BLOCK_SIZE;l++){
          ans += abs(Xb[l][offX]-Qb[l][offQ]);
        }
        __syncthreads();
      }

      //compare to the previous min and store into shared mem if needed.
      if(j*BLOCK_SIZE+offX<groupCount && ans<min[offQ][offX]){
        min[offQ][offX] = ans;
        minPos[offQ][offX] = xmap.mat[IDX( g, xBlock+offX, xmap.ld )];
      }
      __syncthreads();
    }
  }

  //reduce across threads
  for(k=BLOCK_SIZE/2;k>0;k/=2){
    if(offX<k){
      if(min[offQ][offX+k]<min[offQ][offX]){
        min[offQ][offX] = min[offQ][offX+k];
        minPos[offQ][offX] = minPos[offQ][offX+k];
      }
    }
    __syncthreads();
  }

  if(offX==0 && qIDs[qBlock+offQ]!=DUMMY_IDX){
    dMins[qIDs[qBlock+offQ]] = min[offQ][0];
    dMinIDs[qIDs[qBlock+offQ]] = minPos[offQ][0];
  }
}

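// Computes the pruning matrix cM from the distance matrix D: entry (q,x) is
// set to 1 when D(q,x) - radiiX[x] - 2*radiiQ[q] <= 0, i.e. when a
// triangle-inequality test on the distances and radii cannot rule the
// pair (q,x) out.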
__global__ void pruneKernel(const matrix D, const real *radiiX, const real *radiiQ, charMatrix cM){
  int offX = threadIdx.x;
  int offQ = threadIdx.y;

  int blockX = blockIdx.x * BLOCK_SIZE;
  int blockQ = blockIdx.y * BLOCK_SIZE;

  __shared__ real sD[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ real sRQ[BLOCK_SIZE];
  __shared__ real sRX[BLOCK_SIZE];

  sD[offQ][offX] = D.mat[IDX( blockQ+offQ, blockX+offX, D.ld )];

  if(offQ==0)
    sRX[offX] = radiiX[blockX+offX];
  if(offX==0)
    sRQ[offQ] = radiiQ[blockQ+offQ];

  __syncthreads();

  if(blockQ+offQ < D.r && blockX+offX < D.c){
    cM.mat[IDX( blockQ+offQ, blockX+offX, cM.ld )] = ( sD[offQ][offX] - sRX[offX] - 2*sRQ[offQ] <= 0 ) ? 1 : 0;
    //cM.mat[IDX( blockQ+offQ, blockX+offX, cM.ld )] = ( sD[offQ][offX] - 4*sRQ[offQ] <= 0 ) ? 1 : 0;
  }
}

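// Brute-force 1-NN: for each row of Q, finds the L1-nearest row of X,
// writing the minimum distance to dMins and the minimizing row index to
// dMinIDs.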
__global__ void nnKernel(const matrix Q, const matrix X, real *dMins, int *dMinIDs){
  int qBlock = blockIdx.y * BLOCK_SIZE; //indexes Q
  int xBlock; //indexes X
  int colBlock;
  int offQ = threadIdx.y; //the offset of qPos in this block
  int offX = threadIdx.x; //ditto for x
  int i,j,k;

  __shared__ real min[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ int minPos[BLOCK_SIZE][BLOCK_SIZE];

  __shared__ real Xb[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ real Qb[BLOCK_SIZE][BLOCK_SIZE];

  min[offQ][offX]=MAX_REAL;
  __syncthreads();

  for(i=0;i<X.pr/BLOCK_SIZE;i++){
    xBlock = i*BLOCK_SIZE;
    real ans=0;
    for(j=0;j<X.pc/BLOCK_SIZE;j++){
      colBlock = j*BLOCK_SIZE;

      //Each thread loads one element of X and Q into shared memory.
      //Note that the indexing is flipped to increase memory coalescing.
      Xb[offX][offQ] = X.mat[IDX( xBlock+offQ, colBlock+offX, X.ld )];
      Qb[offX][offQ] = Q.mat[IDX( qBlock+offQ, colBlock+offX, Q.ld )];
      __syncthreads();

      for(k=0;k<BLOCK_SIZE;k++){
        ans += abs(Xb[k][offX]-Qb[k][offQ]);
      }
      __syncthreads();
    }

    if( xBlock+offX<X.r && ans<min[offQ][offX] ){
      minPos[offQ][offX] = xBlock+offX;
      min[offQ][offX] = ans;
    }
  }
  __syncthreads();

  //reduce across threads
  for(j=BLOCK_SIZE/2;j>0;j/=2){
    if(offX<j){
      if(min[offQ][offX+j]<min[offQ][offX]){
        min[offQ][offX] = min[offQ][offX+j];
        minPos[offQ][offX] = minPos[offQ][offX+j];
      }
    }
    __syncthreads();
  }

  if(offX==0){
    dMins[qBlock+offQ] = min[offQ][0];
    dMinIDs[qBlock+offQ] = minPos[offQ][0];
  }
}
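
/* A minimal launch sketch for nnKernel (an assumption -- the actual host-side
   wrappers live elsewhere in this repo): each BLOCK_SIZE x BLOCK_SIZE thread
   block handles BLOCK_SIZE queries, so the grid's y-dimension must cover all
   padded rows of Q:

     dim3 block( BLOCK_SIZE, BLOCK_SIZE );
     dim3 grid( 1, Q.pr/BLOCK_SIZE );
     nnKernel<<<grid,block>>>( Q, X, dMins, dMinIDs );
*/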
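// Computes all pairwise L1 distances between a block of rows of Q (starting
// at row qStart) and a block of rows of X (starting at row xStart), writing
// the results into D.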
__global__ void dist1Kernel(const matrix Q, int qStart, const matrix X, int xStart, matrix D){
  int c, i, j;

  int qB = blockIdx.y*BLOCK_SIZE + qStart;
  int q = threadIdx.y;
  int xB = blockIdx.x*BLOCK_SIZE + xStart;
  int x = threadIdx.x;

  real ans=0;

  //This thread is responsible for computing the dist between Q[qB+q] and X[xB+x]

  __shared__ real Qs[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ real Xs[BLOCK_SIZE][BLOCK_SIZE];

  for(i=0 ; i<Q.pc/BLOCK_SIZE ; i++){
    c = i*BLOCK_SIZE; //current col block

    Qs[x][q] = Q.mat[ IDX(qB+q, c+x, Q.ld) ];
    Xs[x][q] = X.mat[ IDX(xB+q, c+x, X.ld) ];
    __syncthreads();

    for(j=0 ; j<BLOCK_SIZE ; j++)
      ans += abs( Qs[j][q] - Xs[j][x] );

    __syncthreads();
  }

  D.mat[ IDX( qB+q, xB+x, D.ld ) ] = ans;
}

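// For each row of D, binary-searches for a range (threshold) such that the
// number of entries in the row at or below it lands between 90% of cntWant
// and cntWant, writing the result to ranges. Each row is processed by a
// 4*BLOCK_SIZE-wide slice of the thread block.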
__global__ void findRangeKernel(matrix D, real *ranges, int cntWant){
  int row = blockIdx.y*(BLOCK_SIZE/4) + threadIdx.y;
  int ro = threadIdx.y;
  int co = threadIdx.x;
  int i, c;
  real t;

  const int LB = (90*cntWant)/100;
  const int UB = cntWant;

  __shared__ real smin[BLOCK_SIZE/4][4*BLOCK_SIZE];
  __shared__ real smax[BLOCK_SIZE/4][4*BLOCK_SIZE];

  //Find the min and max of each row; these bracket the binary search.
  real min=MAX_REAL;
  real max=0;
  for(c=0 ; c<D.pc ; c+=(4*BLOCK_SIZE)){
    if( c+co < D.c ){
      t = D.mat[ IDX( row, c+co, D.ld ) ];
      min = MIN(t,min);
      max = MAX(t,max);
    }
  }

  smin[ro][co] = min;
  smax[ro][co] = max;
  __syncthreads();

  for(i=2*BLOCK_SIZE ; i>0 ; i/=2){
    if( co < i ){
      smin[ro][co] = MIN( smin[ro][co], smin[ro][co+i] );
      smax[ro][co] = MAX( smax[ro][co], smax[ro][co+i] );
    }
    __syncthreads();
  }

  //Now start range counting.
  int cnt;
  real rg;
  __shared__ int scnt[BLOCK_SIZE/4][4*BLOCK_SIZE];
  __shared__ char cont[BLOCK_SIZE/4];

  if(co==0)
    cont[ro]=1;

  do{
    __syncthreads();

    if( cont[ro] ) //if this row didn't actually need to continue, leave rg as it was.
      rg = ( smax[ro][0] + smin[ro][0] ) / ((real)2.0);

    //Count the entries of this row that fall within the candidate range.
    cnt=0;
    for(c=0 ; c<D.pc ; c+=(4*BLOCK_SIZE)){
      cnt += (c+co < D.c && row < D.r && D.mat[ IDX( row, c+co, D.ld ) ] <= rg);
    }

    scnt[ro][co] = cnt;
    __syncthreads();

    for(i=2*BLOCK_SIZE ; i>0 ; i/=2){
      if( co < i ){
        scnt[ro][co] += scnt[ro][co+i];
      }
      __syncthreads();
    }

    //Halve the search interval.
    if(co==0){
      if( scnt[ro][0] < cntWant )
        smin[ro][0] = rg;
      else
        smax[ro][0] = rg;
    }

    // cont[ro] == this row needs to continue
    if(co==0)
      cont[ro] = row<D.r && ( scnt[ro][0] < LB || scnt[ro][0] > UB );
    __syncthreads();

    // Determine if *any* of the rows needs to continue
    for(i=BLOCK_SIZE/8 ; i>0 ; i/=2){
      if( ro < i && co==0 )
        cont[ro] |= cont[ro+i];
      __syncthreads();
    }
  } while(cont[0]);

  if(co==0 && row<D.r)
    ranges[row] = rg;
}

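// Flags in ir which entries of D fall within the per-row range: ir(row,col)
// is set to 1 exactly when D(row,col) < ranges[row]. xOff and yOff select
// the sub-block of D to process.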
__global__ void rangeSearchKernel(matrix D, int xOff, int yOff, real *ranges, charMatrix ir){
  int col = blockIdx.x*BLOCK_SIZE + threadIdx.x + xOff;
  int row = blockIdx.y*BLOCK_SIZE + threadIdx.y + yOff;

  ir.mat[IDX( row, col, ir.ld )] = D.mat[IDX( row, col, D.ld )] < ranges[row];
}

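// For each row q of Q, counts how many rows of X lie within L1 distance
// ranges[q] of it, writing the count to counts.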
__global__ void rangeCountKernel(const matrix Q, const matrix X, real *ranges, int *counts){
  int q = blockIdx.y*BLOCK_SIZE;
  int qo = threadIdx.y;
  int xo = threadIdx.x;

  real rg = ranges[q+qo];

  int r,c,i;

  __shared__ int scnt[BLOCK_SIZE][BLOCK_SIZE];

  __shared__ real xs[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ real qs[BLOCK_SIZE][BLOCK_SIZE];

  int cnt=0;
  for( r=0; r<X.pr; r+=BLOCK_SIZE ){
    real dist=0;
    for( c=0; c<X.pc; c+=BLOCK_SIZE ){
      xs[xo][qo] = X.mat[IDX( r+qo, c+xo, X.ld )];
      qs[xo][qo] = Q.mat[IDX( q+qo, c+xo, Q.ld )];
      __syncthreads();

      for( i=0; i<BLOCK_SIZE; i++ )
        dist += abs(xs[i][xo]-qs[i][qo]);
      __syncthreads();
    }
    cnt += r+xo<X.r && dist<rg;
  }

  scnt[qo][xo]=cnt;
  __syncthreads();

  for( i=BLOCK_SIZE/2; i>0; i/=2 ){
    if( xo<i ){
      scnt[qo][xo] += scnt[qo][xo+i];
    }
    __syncthreads();
  }

  if( xo==0 && q+qo<Q.r )
    counts[q+qo] = scnt[qo][0];
}

#endif