#ifndef KERNELS_CU
#define KERNELS_CU

#include <cuda.h>
#include "defs.h"
#include "kernels.h"
#include <stdio.h>

// This kernel does the same thing as nnKernel, except it only considers pairs as
// specified by the compPlan.
__global__ void planNNKernel(const matrix Q, const matrix X, real *dMins, int *dMinIDs, compPlan cP, intMatrix xmap, int *qIDs, int qStartPos ){
  int qBlock = qStartPos + blockIdx.y * BLOCK_SIZE; //indexes Q
  int xBlock; //indexes X
  int colBlock;
  int offQ = threadIdx.y; //the offset of qPos in this block
  int offX = threadIdx.x; //ditto for x
  int i,j,k,l;


  __shared__ real min[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ int minPos[BLOCK_SIZE][BLOCK_SIZE];

  __shared__ real Xb[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ real Qb[BLOCK_SIZE][BLOCK_SIZE];

  __shared__ int g; //group of q
  __shared__ int numGroups;
  __shared__ int groupCount;
  __shared__ int groupOff;

  if(offX==0 && offQ==0){
    g = cP.qToGroup[qBlock];
    numGroups = cP.numGroups[g];
  }
  min[offQ][offX]=MAX_REAL;
  __syncthreads();

  //NOTE: might be better to save numGroup, groupIts in local reg.

  for(i=0;i<numGroups;i++){ //iterate over groups of X
    if(offX==0 && offQ==0){
      groupCount = cP.groupCountX[IDX(g,i,cP.ld)];
      groupOff = cP.groupOff[IDX(g,i,cP.ld)];
    }

    __syncthreads();

    int groupIts = groupCount/BLOCK_SIZE + (groupCount%BLOCK_SIZE==0? 0 : 1);

    for(j=0;j<groupIts;j++){ //iterate over elements of group
      xBlock = groupOff + j*BLOCK_SIZE; //offset of this block within group i's entries in xmap row g

      real ans=0;
      for(k=0;k<X.pc/BLOCK_SIZE;k++){ // iterate over cols to compute the distances
        colBlock = k*BLOCK_SIZE;

        //Each thread loads one element of X and Q into memory.
        //Note that the indexing is flipped to increase memory
        //coalescing.

        Xb[offX][offQ] = X.mat[IDX( xmap.mat[IDX( g, xBlock+offQ, xmap.ld)], colBlock+offX, X.ld)];
        Qb[offX][offQ] = ( (qIDs[qBlock+offQ]==DUMMY_IDX) ? 0 : Q.mat[IDX(qIDs[qBlock+offQ],colBlock+offX,Q.ld)] );
        __syncthreads();

        for(l=0;l<BLOCK_SIZE;l++){
          ans+=abs(Xb[l][offX]-Qb[l][offQ]);
        }
        __syncthreads();
      }

      //compare to previous min and store into shared mem if needed.
      if(j*BLOCK_SIZE+offX<groupCount && ans<min[offQ][offX]){
        min[offQ][offX]=ans;
        minPos[offQ][offX]= xmap.mat[IDX( g, xBlock+offX, xmap.ld )];
      }
      __syncthreads();

    }
  }

  //compare across threads
  for(k=BLOCK_SIZE/2;k>0;k/=2){
    if(offX<k){
      if(min[offQ][offX+k]<min[offQ][offX]){
        min[offQ][offX] = min[offQ][offX+k];
        minPos[offQ][offX] = minPos[offQ][offX+k];
      }
    }
    __syncthreads();
  }

  if(offX==0 && qIDs[qBlock+offQ]!=DUMMY_IDX){
    dMins[qIDs[qBlock+offQ]] = min[offQ][0];
    dMinIDs[qIDs[qBlock+offQ]] = minPos[offQ][0];
  }
}


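// Illustrative sketch (not part of the original file): a host-side launch of
// planNNKernel consistent with the indexing above.  The kernel only reads
// blockIdx.y, so the grid is one block wide and one block tall per BLOCK_SIZE
// queries handled in this call, with BLOCK_SIZE x BLOCK_SIZE threads per block.
// The function name, parameter names, and the assumption that numQueries is
// already padded to a multiple of BLOCK_SIZE are hypothetical; the repository's
// own wrapper code may differ.
static void planNNLaunchSketch(const matrix dQ, const matrix dX, real *dMins,
                               int *dMinIDs, compPlan dcP, intMatrix dxmap,
                               int *dqIDs, int numQueries, int qStartPos){
  dim3 block(BLOCK_SIZE, BLOCK_SIZE);
  dim3 grid(1, numQueries/BLOCK_SIZE);
  planNNKernel<<<grid,block>>>(dQ, dX, dMins, dMinIDs, dcP, dxmap, dqIDs, qStartPos);
  cudaThreadSynchronize(); //deprecated call matching this code's era; cudaDeviceSynchronize() on newer toolkits
}
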
// Marks which entries of D survive a triangle-inequality style test:
// cM[q][x] = 1 iff D[q][x] - radiiX[x] - 2*radiiQ[q] <= 0; pairs with
// cM[q][x] = 0 can be pruned from further consideration.
__global__ void pruneKernel(const matrix D, const real *radiiX, const real *radiiQ, charMatrix cM){
  int offX = threadIdx.x;
  int offQ = threadIdx.y;

  int blockX = blockIdx.x * BLOCK_SIZE;
  int blockQ = blockIdx.y * BLOCK_SIZE;

  __shared__ real sD[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ real sRQ[BLOCK_SIZE];
  __shared__ real sRX[BLOCK_SIZE];

  sD[offQ][offX]=D.mat[IDX(blockQ+offQ,blockX+offX,D.ld)];

  if(offQ==0)
    sRX[offX]=radiiX[blockX+offX];
  if(offX==0)
    sRQ[offQ]=radiiQ[blockQ+offQ];

  __syncthreads();

  if(blockQ+offQ < D.r && blockX+offX < D.c){
    cM.mat[IDX(blockQ+offQ,blockX+offX,cM.ld)] = (sD[offQ][offX]-sRX[offX]-2*sRQ[offQ] <= 0) ? 1 : 0;
    //cM.mat[IDX(blockQ+offQ,blockX+offX,cM.ld)] = (sD[offQ][offX]-4*sRQ[offQ] <= 0) ? 1 : 0;
  }
}


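// Illustrative sketch (not part of the original file): a host-side launch of
// pruneKernel covering the full padded distance matrix, one thread per entry.
// The function name and the use of the padded dimensions pr/pc are assumptions.
static void pruneLaunchSketch(const matrix dD, const real *dradiiX, const real *dradiiQ, charMatrix dcM){
  dim3 block(BLOCK_SIZE, BLOCK_SIZE);
  dim3 grid(dD.pc/BLOCK_SIZE, dD.pr/BLOCK_SIZE);
  pruneKernel<<<grid,block>>>(dD, dradiiX, dradiiQ, dcM);
  cudaThreadSynchronize();
}
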
// Brute-force 1-NN kernel: for each row of Q, finds the closest row of X under
// the L1 (sum of absolute differences) distance, writing the distance to dMins
// and the index of the nearest row to dMinIDs.
__global__ void nnKernel(const matrix Q, const matrix X, real *dMins, int *dMinIDs){

  int qBlock = blockIdx.y * BLOCK_SIZE; //indexes Q
  int xBlock; //indexes X
  int colBlock;
  int offQ = threadIdx.y; //the offset of qPos in this block
  int offX = threadIdx.x; //ditto for x
  int i,j,k;

  __shared__ real min[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ int minPos[BLOCK_SIZE][BLOCK_SIZE];

  __shared__ real Xb[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ real Qb[BLOCK_SIZE][BLOCK_SIZE];

  min[offQ][offX]=MAX_REAL;
  __syncthreads();

  for(i=0;i<X.pr/BLOCK_SIZE;i++){
    xBlock = i*BLOCK_SIZE;
    real ans=0;
    for(j=0;j<X.pc/BLOCK_SIZE;j++){
      colBlock = j*BLOCK_SIZE;

      //Each thread loads one element of X and Q into memory.
      //Note that the indexing is flipped to increase memory
      //coalescing.
      Xb[offX][offQ]=X.mat[IDX(xBlock+offQ,colBlock+offX,X.ld)];
      Qb[offX][offQ]=Q.mat[IDX(qBlock+offQ,colBlock+offX,Q.ld)];

      __syncthreads();

      for(k=0;k<BLOCK_SIZE;k++){
        ans+=abs(Xb[k][offX]-Qb[k][offQ]);
      }
      __syncthreads();
    }


    if( xBlock+offX<X.r && ans<min[offQ][offX] ){
      minPos[offQ][offX] = xBlock+offX;
      min[offQ][offX] = ans;
    }
  }
  __syncthreads();


  //reduce across threads
  for(j=BLOCK_SIZE/2;j>0;j/=2){
    if(offX<j){
      if(min[offQ][offX+j]<min[offQ][offX]){
        min[offQ][offX] = min[offQ][offX+j];
        minPos[offQ][offX] = minPos[offQ][offX+j];
      }
    }
    __syncthreads();
  }

  if(offX==0){
    //printf("writing %d, nn= %d, val = %6.4f \n",qBlock+offQ,curMinPos[offQ],curMin[offQ]);
    dMins[qBlock+offQ] = min[offQ][0];
    dMinIDs[qBlock+offQ] = minPos[offQ][0];
  }
}


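// Illustrative sketch (not part of the original file): a straightforward
// host-side reference for nnKernel, handy for checking the kernel's output on
// small inputs.  It assumes Q.mat and X.mat point to host copies of the data
// (fields mat, r, c, ld as used above); the function name is hypothetical.
static void nnReferenceHost(const matrix Q, const matrix X, real *mins, int *minIDs){
  for(int qi=0; qi<Q.r; qi++){
    real best = MAX_REAL;
    int bestID = 0;
    for(int xi=0; xi<X.r; xi++){
      real d = 0;
      for(int c=0; c<X.c; c++){
        real diff = Q.mat[IDX(qi,c,Q.ld)] - X.mat[IDX(xi,c,X.ld)];
        d += (diff<0) ? -diff : diff; //L1 distance, same as the kernel
      }
      if(d < best){
        best = d;
        bestID = xi;
      }
    }
    mins[qi] = best;
    minIDs[qi] = bestID;
  }
}
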
// Computes the full matrix of pairwise L1 distances between the rows of Q and
// the rows of X: D[q][x] = sum over columns c of |Q[q][c] - X[x][c]|.
__global__ void dist1Kernel(const matrix Q, const matrix X, matrix D){
  int c, i, j;

  int qB = blockIdx.y*BLOCK_SIZE;
  int q  = threadIdx.y;
  int xB = blockIdx.x*BLOCK_SIZE;
  int x  = threadIdx.x;

  real ans=0;

  //This thread is responsible for computing the dist between Q[qB+q] and X[xB+x]

  __shared__ real Qs[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ real Xs[BLOCK_SIZE][BLOCK_SIZE];


  for(i=0 ; i<Q.pc/BLOCK_SIZE ; i++){
    c=i*BLOCK_SIZE; //current col block

    Qs[x][q] = Q.mat[ IDX(qB+q, c+x, Q.ld) ];
    Xs[x][q] = X.mat[ IDX(xB+q, c+x, X.ld) ];

    __syncthreads();

    for(j=0 ; j<BLOCK_SIZE ; j++)
      ans += abs( Qs[j][q] - Xs[j][x] );

    __syncthreads();
  }

  D.mat[ IDX( qB+q, xB+x, D.ld ) ] = ans;

}


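// Illustrative sketch (not part of the original file): a host-side launch of
// dist1Kernel.  blockIdx.x walks blocks of X rows and blockIdx.y walks blocks
// of Q rows, so the grid covers the padded row counts of both matrices.  The
// function name and the use of the padded dimension pr are assumptions.
static void dist1LaunchSketch(const matrix dQ, const matrix dX, matrix dD){
  dim3 block(BLOCK_SIZE, BLOCK_SIZE);
  dim3 grid(dX.pr/BLOCK_SIZE, dQ.pr/BLOCK_SIZE);
  dist1Kernel<<<grid,block>>>(dQ, dX, dD);
  cudaThreadSynchronize();
}
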
// For each row of D, bisection-searches for a threshold ("range") such that the
// number of entries in that row that are <= the threshold falls between
// LB = (90*cntWant)/100 and UB = cntWant; the result is written to ranges[row].
__global__ void findRangeKernel(matrix D, real *ranges, int cntWant){

  int row = blockIdx.y*(BLOCK_SIZE/4)+threadIdx.y;
  int ro = threadIdx.y;
  int co = threadIdx.x;
  int i, c;
  real t;

  const int LB = (90*cntWant)/100 ;
  const int UB = cntWant;

  __shared__ real smin[BLOCK_SIZE/4][4*BLOCK_SIZE];
  __shared__ real smax[BLOCK_SIZE/4][4*BLOCK_SIZE];

  real min=MAX_REAL;
  real max=0;
  for(c=0 ; c<D.pc ; c+=(4*BLOCK_SIZE)){
    if( c+co < D.c ){
      t = D.mat[ IDX( row, c+co, D.ld ) ];
      min = MIN(t,min);
      max = MAX(t,max);
    }
  }

  smin[ro][co] = min;
  smax[ro][co] = max;
  __syncthreads();

  for(i=2*BLOCK_SIZE ; i>0 ; i/=2){
    if( co < i ){
      smin[ro][co] = MIN( smin[ro][co], smin[ro][co+i] );
      smax[ro][co] = MAX( smax[ro][co], smax[ro][co+i] );
    }
    __syncthreads();
  }

  //Now start range counting.

  int itcount=0;
  int cnt;
  real rg;
  __shared__ int scnt[BLOCK_SIZE/4][4*BLOCK_SIZE];
  __shared__ char cont[BLOCK_SIZE/4];

  if(co==0)
    cont[ro]=1;

  do{
    itcount++;
    __syncthreads();

    if( cont[ro] ) //if we didn't actually need to cont, leave rg as it was.
      rg = ( smax[ro][0] + smin[ro][0] ) / ((real)2.0) ;

    cnt=0;
    for(c=0 ; c<D.pc ; c+=(4*BLOCK_SIZE)){
      cnt += (c+co < D.c && row < D.r && D.mat[ IDX( row, c+co, D.ld ) ] <= rg);
    }

    scnt[ro][co] = cnt;
    __syncthreads();

    for(i=2*BLOCK_SIZE ; i>0 ; i/=2){
      if( co < i ){
        scnt[ro][co] += scnt[ro][co+i];
      }
      __syncthreads();
    }

    if(co==0){
      if( scnt[ro][0] < cntWant )
        smin[ro][0]=rg;
      else
        smax[ro][0]=rg;
    }

    // cont[ro] == this row needs to continue
    if(co==0)
      cont[ro] = row<D.r && ( scnt[ro][0] < LB || scnt[ro][0] > UB );
    __syncthreads();

    // Determine if *any* of the rows need to continue
    for(i=BLOCK_SIZE/8 ; i>0 ; i/=2){
      if( ro < i && co==0)
        cont[ro] |= cont[ro+i];
      __syncthreads();
    }

  } while(cont[0]);

  if(co==0 && row<D.r )
    ranges[row]=rg;

}


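// Illustrative sketch (not part of the original file): a host-side launch of
// findRangeKernel.  Matching the shared-memory tiles above, the thread block is
// deliberately non-square (4*BLOCK_SIZE wide, BLOCK_SIZE/4 tall), and each block
// handles BLOCK_SIZE/4 rows of D.  The function name is an assumption.
static void findRangeLaunchSketch(const matrix dD, real *dranges, int cntWant){
  dim3 block(4*BLOCK_SIZE, BLOCK_SIZE/4);
  dim3 grid(1, dD.pr/(BLOCK_SIZE/4));
  findRangeKernel<<<grid,block>>>(dD, dranges, cntWant);
  cudaThreadSynchronize();
}
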
// Thresholds D by the per-row ranges: ir[row][col] = 1 iff D[row][col] < ranges[row].
__global__ void rangeSearchKernel(matrix D, real *ranges, charMatrix ir){
  int col = blockIdx.x*BLOCK_SIZE + threadIdx.x;
  int row = blockIdx.y*BLOCK_SIZE + threadIdx.y;

  ir.mat[IDX( row, col, ir.ld )] = D.mat[IDX( row, col, D.ld )] < ranges[row];

}


// For each row of Q, counts the number of rows of X whose L1 distance to that
// query is strictly less than ranges[query]; the count is written to counts.
__global__ void rangeCountKernel(const matrix Q, const matrix X, real *ranges, int *counts){
  int q = blockIdx.y*BLOCK_SIZE;
  int qo = threadIdx.y;
  int xo = threadIdx.x;

  real rg = ranges[q+qo];

  int r,c,i;

  __shared__ int scnt[BLOCK_SIZE][BLOCK_SIZE];

  __shared__ real xs[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ real qs[BLOCK_SIZE][BLOCK_SIZE];

  int cnt=0;
  for( r=0; r<X.pr; r+=BLOCK_SIZE ){

    real dist=0;
    for( c=0; c<X.pc; c+=BLOCK_SIZE){
      xs[xo][qo] = X.mat[IDX( r+qo, c+xo, X.ld )];
      qs[xo][qo] = Q.mat[IDX( q+qo, c+xo, Q.ld )];
      __syncthreads();

      for( i=0; i<BLOCK_SIZE; i++)
        dist += abs(xs[i][xo]-qs[i][qo]);
      __syncthreads();

    }
    cnt += r+xo<X.r && dist<rg;

  }

  scnt[qo][xo]=cnt;
  __syncthreads();

  for( i=BLOCK_SIZE/2; i>0; i/=2 ){
    if( xo<i ){
      scnt[qo][xo] += scnt[qo][xo+i];
    }
    __syncthreads();
  }

  if( xo==0 && q+qo<Q.r )
    counts[q+qo] = scnt[qo][0];
}


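// Illustrative sketch (not part of the original file): a host-side reference
// for rangeCountKernel on small inputs, mirroring the kernel's strict "<" test.
// It assumes Q.mat and X.mat point to host copies of the data; the function
// name is hypothetical.
static void rangeCountReferenceHost(const matrix Q, const matrix X, const real *ranges, int *counts){
  for(int qi=0; qi<Q.r; qi++){
    int cnt = 0;
    for(int xi=0; xi<X.r; xi++){
      real d = 0;
      for(int c=0; c<X.c; c++){
        real diff = Q.mat[IDX(qi,c,Q.ld)] - X.mat[IDX(xi,c,X.ld)];
        d += (diff<0) ? -diff : diff;
      }
      cnt += (d < ranges[qi]);
    }
    counts[qi] = cnt;
  }
}
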
#endif