/* This file is part of the Random Ball Cover (RBC) library.
 * (C) Copyright 2010, Lawrence Cayton [lcayton@tuebingen.mpg.de]
 */

#ifndef KERNELS_CU
#define KERNELS_CU

#include <cuda.h>
#include <stdio.h>
#include "defs.h"
#include "kernels.h"

// This kernel does the same thing as nnKernel, except it only considers
// pairs as specified by the compPlan.
__global__ void planNNKernel(const matrix Q, const unint *qMap, const matrix X, const intMatrix xMap, real *dMins, unint *dMinIDs, compPlan cP, unint qStartPos){
  unint qB = qStartPos + blockIdx.y * BLOCK_SIZE; //indexes Q
  unint xB; //X (DB) block
  unint cB; //column block
  unint offQ = threadIdx.y; //the offset of qPos in this block
  unint offX = threadIdx.x; //ditto for x
  unint i,j,k;
  unint groupIts;

  __shared__ real min[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ unint minPos[BLOCK_SIZE][BLOCK_SIZE];

  __shared__ real Xs[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ real Qs[BLOCK_SIZE][BLOCK_SIZE];

  unint g; //query group of q
  unint xG; //DB group currently being examined
  unint numGroups;
  unint groupCount;

  g = cP.qToQGroup[qB];
  numGroups = cP.numGroups[g];

  min[offQ][offX] = MAX_REAL;
  __syncthreads();

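  // The search is tiled at three levels: over the DB groups assigned to this
  // query group, over BLOCK_SIZE-sized chunks of each group (indirected
  // through xMap), and over BLOCK_SIZE-wide column blocks when accumulating
  // each distance.  Padded query slots are marked with DUMMY_IDX in qMap and
  // contribute nothing.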
  for(i=0; i<numGroups; i++){ //iterate over DB groups
    xG = cP.qGroupToXGroup[IDX( g, i, cP.ld )];
    groupCount = cP.groupCountX[IDX( g, i, cP.ld )];
    groupIts = (groupCount+BLOCK_SIZE-1)/BLOCK_SIZE;

    for(j=0; j<groupIts; j++){ //iterate over elements of the group
      xB = j*BLOCK_SIZE;

      real ans=0;
      for(cB=0; cB<X.pc; cB+=BLOCK_SIZE){ //iterate over cols to compute distances

        Xs[offX][offQ] = X.mat[IDX( xMap.mat[IDX( xG, xB+offQ, xMap.ld )], cB+offX, X.ld )];
        Qs[offX][offQ] = ( (qMap[qB+offQ]==DUMMY_IDX) ? 0 : Q.mat[IDX( qMap[qB+offQ], cB+offX, Q.ld )] );
        __syncthreads();

        for(k=0; k<BLOCK_SIZE; k++)
          ans += DIST( Xs[k][offX], Qs[k][offQ] );

        __syncthreads();
      }

      //compare to previous min and store into shared mem if needed.
      if(xB+offX<groupCount && ans<min[offQ][offX]){
        min[offQ][offX] = ans;
        minPos[offQ][offX] = xMap.mat[IDX( xG, xB+offX, xMap.ld )];
      }
      __syncthreads();
    }
  }

  //Reduce across threads; afterwards min[offQ][0] and minPos[offQ][0] hold
  //the minimum distance and its DB index for query offQ.
  for(i=BLOCK_SIZE/2; i>0; i/=2){
    if( offX<i ){
      if( min[offQ][offX+i] < min[offQ][offX] ){
        min[offQ][offX] = min[offQ][offX+i];
        minPos[offQ][offX] = minPos[offQ][offX+i];
      }
    }
    __syncthreads();
  }

  if(offX==0 && qMap[qB+offQ]!=DUMMY_IDX){
    dMins[qMap[qB+offQ]] = min[offQ][0];
    dMinIDs[qMap[qB+offQ]] = minPos[offQ][0];
  }
}


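// Brute-force 1-NN kernel.  Each thread block handles BLOCK_SIZE queries:
// distances to all of X are accumulated in BLOCK_SIZE x BLOCK_SIZE shared-
// memory tiles, then a parallel reduction leaves each query's minimum
// distance and the corresponding DB index.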
__global__ void nnKernel(const matrix Q, unint numDone, const matrix X, real *dMins, unint *dMinIDs){

  unint qB = blockIdx.y * BLOCK_SIZE + numDone; //indexes Q
  unint xB; //indexes X
  unint cB; //column block
  unint offQ = threadIdx.y; //the offset of qPos in this block
  unint offX = threadIdx.x; //ditto for x
  unint i;
  real ans;

  __shared__ real min[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ unint minPos[BLOCK_SIZE][BLOCK_SIZE];

  __shared__ real Xs[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ real Qs[BLOCK_SIZE][BLOCK_SIZE];

  min[offQ][offX] = MAX_REAL;
  __syncthreads();

  for(xB=0; xB<X.pr; xB+=BLOCK_SIZE){
    ans=0;
    for(cB=0; cB<X.pc; cB+=BLOCK_SIZE){

      //Each thread loads one element of X and Q into shared memory.
      Xs[offX][offQ] = X.mat[IDX( xB+offQ, cB+offX, X.ld )];
      Qs[offX][offQ] = Q.mat[IDX( qB+offQ, cB+offX, Q.ld )];

      __syncthreads();

      for(i=0; i<BLOCK_SIZE; i++)
        ans += DIST( Xs[i][offX], Qs[i][offQ] );

      __syncthreads();
    }

    if( xB+offX<X.r && ans<min[offQ][offX] ){
      minPos[offQ][offX] = xB+offX;
      min[offQ][offX] = ans;
    }
  }
  __syncthreads();

  //reduce across threads
  for(i=BLOCK_SIZE/2; i>0; i/=2){
    if(offX<i){
      if(min[offQ][offX+i]<min[offQ][offX]){
        min[offQ][offX] = min[offQ][offX+i];
        minPos[offQ][offX] = minPos[offQ][offX+i];
      }
    }
    __syncthreads();
  }

  if(offX==0){
    dMins[qB+offQ] = min[offQ][0];
    dMinIDs[qB+offQ] = minPos[offQ][0];
  }
}


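// Brute-force K-NN kernel.  For each query, the K current-best distances are
// kept sorted in columns 0..K-1 of dNN/idNN; each tile of BLOCK_SIZE new
// candidate distances is staged in columns K..K+BLOCK_SIZE-1, sorted with
// sort16off, then folded into the best list with merge32x16.
// NOTE: the literal offsets 16 and 32 below assume BLOCK_SIZE==16 and K==32.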
__global__ void knnKernel(const matrix Q, unint numDone, const matrix X, matrix dMins, intMatrix dMinIDs){

  unint qB = blockIdx.y * BLOCK_SIZE + numDone; //indexes Q
  unint xB; //indexes X
  unint cB; //column block
  unint offQ = threadIdx.y; //the offset of qPos in this block
  unint offX = threadIdx.x; //ditto for x
  unint i;
  real ans;

  __shared__ real Xs[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ real Qs[BLOCK_SIZE][BLOCK_SIZE];

  __shared__ real dNN[BLOCK_SIZE][K+BLOCK_SIZE];
  __shared__ unint idNN[BLOCK_SIZE][K+BLOCK_SIZE];

  dNN[offQ][offX] = MAX_REAL;
  dNN[offQ][offX+16] = MAX_REAL;
  idNN[offQ][offX] = DUMMY_IDX;
  idNN[offQ][offX+16] = DUMMY_IDX;

  __syncthreads();

  for(xB=0; xB<X.pr; xB+=BLOCK_SIZE){
    ans=0;
    for(cB=0; cB<X.pc; cB+=BLOCK_SIZE){

      //Each thread loads one element of X and Q into shared memory.
      Xs[offX][offQ] = X.mat[IDX( xB+offQ, cB+offX, X.ld )];
      Qs[offX][offQ] = Q.mat[IDX( qB+offQ, cB+offX, Q.ld )];
      __syncthreads();

      for(i=0; i<BLOCK_SIZE; i++)
        ans += DIST( Xs[i][offX], Qs[i][offQ] );

      __syncthreads();
    }

    //stage the new candidates after the current best list, then sort and merge
    dNN[offQ][offX+32] = (xB+offX<X.r) ? ans : MAX_REAL;
    idNN[offQ][offX+32] = xB + offX;
    __syncthreads();

    sort16off( dNN, idNN );
    __syncthreads();

    merge32x16( dNN, idNN );
  }
  __syncthreads();

  dMins.mat[IDX( qB+offQ, offX, dMins.ld )] = dNN[offQ][offX];
  dMins.mat[IDX( qB+offQ, offX+16, dMins.ld )] = dNN[offQ][offX+16];
  dMinIDs.mat[IDX( qB+offQ, offX, dMinIDs.ld )] = idNN[offQ][offX];
  dMinIDs.mat[IDX( qB+offQ, offX+16, dMinIDs.ld )] = idNN[offQ][offX+16];
}


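// Computes the pairwise distance block between rows qStart.. of Q and rows
// xStart.. of X, writing D[qB+q][xB+x].  Tiled over BLOCK_SIZE-wide column
// blocks through shared memory; Q and X are assumed to share the same padded
// column count.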
__global__ void dist1Kernel(const matrix Q, unint qStart, const matrix X, unint xStart, matrix D){
  unint c, i, j;

  unint qB = blockIdx.y*BLOCK_SIZE + qStart;
  unint q = threadIdx.y;
  unint xB = blockIdx.x*BLOCK_SIZE + xStart;
  unint x = threadIdx.x;

  real ans=0;

  //This thread is responsible for computing the dist between Q[qB+q] and X[xB+x]

  __shared__ real Qs[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ real Xs[BLOCK_SIZE][BLOCK_SIZE];

  for(i=0; i<Q.pc/BLOCK_SIZE; i++){
    c = i*BLOCK_SIZE; //current column block

    Qs[x][q] = Q.mat[ IDX(qB+q, c+x, Q.ld) ];
    Xs[x][q] = X.mat[ IDX(xB+q, c+x, X.ld) ];

    __syncthreads();

    for(j=0; j<BLOCK_SIZE; j++)
      ans += DIST( Qs[j][q], Xs[j][x] );

    __syncthreads();
  }

  D.mat[ IDX( qB+q, xB+x, D.ld ) ] = ans;
}


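// For each row of D, finds a range r such that roughly cntWant entries of
// the row are <= r.  The row min and max are computed first, then r is
// located by bisection; any count in [LB, UB] = [(90*cntWant)/100, cntWant]
// is accepted.  Each thread block handles BLOCK_SIZE/4 rows, with
// 4*BLOCK_SIZE threads per row.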
__global__ void findRangeKernel(const matrix D, unint numDone, real *ranges, unint cntWant){

  unint row = blockIdx.y*(BLOCK_SIZE/4) + threadIdx.y + numDone;
  unint ro = threadIdx.y;
  unint co = threadIdx.x;
  unint i, c;
  real t;

  const unint LB = (90*cntWant)/100;
  const unint UB = cntWant;

  __shared__ real smin[BLOCK_SIZE/4][4*BLOCK_SIZE];
  __shared__ real smax[BLOCK_SIZE/4][4*BLOCK_SIZE];

  real min=MAX_REAL;
  real max=0;
  for(c=0; c<D.pc; c+=(4*BLOCK_SIZE)){
    if( c+co < D.c ){
      t = D.mat[ IDX( row, c+co, D.ld ) ];
      min = MIN(t,min);
      max = MAX(t,max);
    }
  }

  smin[ro][co] = min;
  smax[ro][co] = max;
  __syncthreads();

  for(i=2*BLOCK_SIZE; i>0; i/=2){
    if( co < i ){
      smin[ro][co] = MIN( smin[ro][co], smin[ro][co+i] );
      smax[ro][co] = MAX( smax[ro][co], smax[ro][co+i] );
    }
    __syncthreads();
  }

  //Now start range counting.

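  // Bisection: each pass probes the midpoint rg of [smin[ro][0], smax[ro][0]],
  // counts the entries of the row that are <= rg, and halves the interval
  // until the count lands in [LB, UB].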
  unint itcount=0;
  unint cnt;
  real rg;
  __shared__ unint scnt[BLOCK_SIZE/4][4*BLOCK_SIZE];
  __shared__ char cont[BLOCK_SIZE/4];

  if(co==0)
    cont[ro]=1;

  do{
    itcount++;
    __syncthreads();

    if( cont[ro] ) //if we didn't actually need to cont, leave rg as it was.
      rg = ( smax[ro][0] + smin[ro][0] ) / ((real)2.0);

    cnt=0;
    for(c=0; c<D.pc; c+=(4*BLOCK_SIZE)){
      cnt += (c+co < D.c && row < D.r && D.mat[ IDX( row, c+co, D.ld ) ] <= rg);
    }

    scnt[ro][co] = cnt;
    __syncthreads();

    for(i=2*BLOCK_SIZE; i>0; i/=2){
      if( co < i ){
        scnt[ro][co] += scnt[ro][co+i];
      }
      __syncthreads();
    }

    if(co==0){
      if( scnt[ro][0] < cntWant )
        smin[ro][0] = rg;
      else
        smax[ro][0] = rg;
    }

    // cont[ro] == this row needs to continue
    if(co==0)
      cont[ro] = row<D.r && ( scnt[ro][0] < LB || scnt[ro][0] > UB );
    __syncthreads();

    // Determine if *any* of the rows need to continue
    for(i=BLOCK_SIZE/8; i>0; i/=2){
      if( ro < i && co==0 )
        cont[ro] |= cont[ro+i];
      __syncthreads();
    }

  } while(cont[0]);

  if(co==0 && row<D.r)
    ranges[row] = rg;
}


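// Marks ir[row][col] = 1 iff D[row][col] < ranges[row] (strict inequality).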
__global__ void rangeSearchKernel(const matrix D, unint xOff, unint yOff, const real *ranges, charMatrix ir){
  unint col = blockIdx.x*BLOCK_SIZE + threadIdx.x + xOff;
  unint row = blockIdx.y*BLOCK_SIZE + threadIdx.y + yOff;

  ir.mat[IDX( row, col, ir.ld )] = D.mat[IDX( row, col, D.ld )] < ranges[row];
}


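// For each query q, counts the DB points whose distance to q is strictly
// less than ranges[q].  Distances are computed in shared-memory tiles as in
// nnKernel, and the per-thread counts are summed with a parallel reduction.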
__global__ void rangeCountKernel(const matrix Q, unint numDone, const matrix X, real *ranges, unint *counts){
  unint q = blockIdx.y*BLOCK_SIZE + numDone;
  unint qo = threadIdx.y;
  unint xo = threadIdx.x;

  real rg = ranges[q+qo];

  unint r,c,i;

  __shared__ unint scnt[BLOCK_SIZE][BLOCK_SIZE];

  __shared__ real xs[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ real qs[BLOCK_SIZE][BLOCK_SIZE];

  unint cnt=0;
  for( r=0; r<X.pr; r+=BLOCK_SIZE ){

    real dist=0;
    for( c=0; c<X.pc; c+=BLOCK_SIZE ){
      xs[xo][qo] = X.mat[IDX( r+qo, c+xo, X.ld )];
      qs[xo][qo] = Q.mat[IDX( q+qo, c+xo, Q.ld )];
      __syncthreads();

      for( i=0; i<BLOCK_SIZE; i++ )
        dist += DIST( xs[i][xo], qs[i][qo] );

      __syncthreads();
    }
    cnt += r+xo<X.r && dist<rg;
  }

  scnt[qo][xo] = cnt;
  __syncthreads();

  for( i=BLOCK_SIZE/2; i>0; i/=2 ){
    if( xo<i ){
      scnt[qo][xo] += scnt[qo][xo+i];
    }
    __syncthreads();
  }

  if( xo==0 && q+qo<Q.r )
    counts[q+qo] = scnt[qo][0];
}


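// Sorts the 16 elements in columns K..K+15 of row threadIdx.y using a fixed
// compare-exchange network (the two 8-element halves are sorted, then
// merged), carrying the index array along in lockstep.  Identical to sort16
// except for the column offset of K; the hard-coded array width of 48
// assumes K==32 and BLOCK_SIZE==16.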
__device__ void sort16off(real x[][48], unint xi[][48]){
  int i = threadIdx.x;
  int j = threadIdx.y;

  if(i%2==0)
    mmGateI( x[j]+K+i, x[j]+K+i+1, xi[j]+K+i, xi[j]+K+i+1 );
  __syncthreads();

  if(i%4<2)
    mmGateI( x[j]+K+i, x[j]+K+i+2, xi[j]+K+i, xi[j]+K+i+2 );
  __syncthreads();

  if(i%4==1)
    mmGateI( x[j]+K+i, x[j]+K+i+1, xi[j]+K+i, xi[j]+K+i+1 );
  __syncthreads();

  if(i%8<4)
    mmGateI( x[j]+K+i, x[j]+K+i+4, xi[j]+K+i, xi[j]+K+i+4 );
  __syncthreads();

  if(i%8==2 || i%8==3)
    mmGateI( x[j]+K+i, x[j]+K+i+2, xi[j]+K+i, xi[j]+K+i+2 );
  __syncthreads();

  if( i%2 && i%8 != 7 )
    mmGateI( x[j]+K+i, x[j]+K+i+1, xi[j]+K+i, xi[j]+K+i+1 );
  __syncthreads();

  //0-7; 8-15 now sorted. merge time.
  if( i<8 )
    mmGateI( x[j]+K+i, x[j]+K+i+8, xi[j]+K+i, xi[j]+K+i+8 );
  __syncthreads();

  if( i>3 && i<8 )
    mmGateI( x[j]+K+i, x[j]+K+i+4, xi[j]+K+i, xi[j]+K+i+4 );
  __syncthreads();

  int os = (i/2)*4+2 + i%2;
  if(i<6)
    mmGateI( x[j]+K+os, x[j]+K+os+2, xi[j]+K+os, xi[j]+K+os+2 );
  __syncthreads();

  if( i%2 && i<15 )
    mmGateI( x[j]+K+i, x[j]+K+i+1, xi[j]+K+i, xi[j]+K+i+1 );
}


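// Same network as sort16off, but operating on columns 0..15: sorts the two
// 8-element halves of the row, then merges them, swapping the index array in
// lockstep.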
__device__ void sort16(real x[][16], unint xi[][16]){
  int i = threadIdx.x;
  int j = threadIdx.y;

  if(i%2==0)
    mmGateI( x[j]+i, x[j]+i+1, xi[j]+i, xi[j]+i+1 );
  __syncthreads();

  if(i%4<2)
    mmGateI( x[j]+i, x[j]+i+2, xi[j]+i, xi[j]+i+2 );
  __syncthreads();

  if(i%4==1)
    mmGateI( x[j]+i, x[j]+i+1, xi[j]+i, xi[j]+i+1 );
  __syncthreads();

  if(i%8<4)
    mmGateI( x[j]+i, x[j]+i+4, xi[j]+i, xi[j]+i+4 );
  __syncthreads();

  if(i%8==2 || i%8==3)
    mmGateI( x[j]+i, x[j]+i+2, xi[j]+i, xi[j]+i+2 );
  __syncthreads();

  if( i%2 && i%8 != 7 )
    mmGateI( x[j]+i, x[j]+i+1, xi[j]+i, xi[j]+i+1 );
  __syncthreads();

  //0-7; 8-15 now sorted. merge time.
  if( i<8 )
    mmGateI( x[j]+i, x[j]+i+8, xi[j]+i, xi[j]+i+8 );
  __syncthreads();

  if( i>3 && i<8 )
    mmGateI( x[j]+i, x[j]+i+4, xi[j]+i, xi[j]+i+4 );
  __syncthreads();

  int os = (i/2)*4+2 + i%2;
  if(i<6)
    mmGateI( x[j]+os, x[j]+os+2, xi[j]+os, xi[j]+os+2 );
  __syncthreads();

  if( i%2 && i<15 )
    mmGateI( x[j]+i, x[j]+i+1, xi[j]+i, xi[j]+i+1 );

}


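// Merges the sorted 32-element list in columns 0..31 with the sorted
// 16-element list in columns 32..47, so that columns 0..31 again hold the
// 32 smallest values in sorted order.  The widths are hard-coded for K==32
// and BLOCK_SIZE==16.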
__device__ void merge32x16(real x[][48], unint xi[][48]){
  int i = threadIdx.x;
  int j = threadIdx.y;

  mmGateI( x[j]+i, x[j]+i+32, xi[j]+i, xi[j]+i+32 );
  __syncthreads();

  mmGateI( x[j]+i+16, x[j]+i+32, xi[j]+i+16, xi[j]+i+32 );
  __syncthreads();

  int os = (i<8) ? 24 : 0;
  mmGateI( x[j]+os+i, x[j]+os+i+8, xi[j]+os+i, xi[j]+os+i+8 );
  __syncthreads();

  os = (i/4)*8+4 + i%4;
  mmGateI( x[j]+os, x[j]+os+4, xi[j]+os, xi[j]+os+4 );
  if(i<4)
    mmGateI( x[j]+36+i, x[j]+36+i+4, xi[j]+36+i, xi[j]+36+i+4 );
  __syncthreads();

  os = (i/2)*4+2 + i%2;
  mmGateI( x[j]+os, x[j]+os+2, xi[j]+os, xi[j]+os+2 );

  os = (i/2)*4+34 + i%2;
  if(i<6)
    mmGateI( x[j]+os, x[j]+os+2, xi[j]+os, xi[j]+os+2 );
  __syncthreads();

  os = 2*i+1;
  mmGateI( x[j]+os, x[j]+os+1, xi[j]+os, xi[j]+os+1 );

  os = 2*i+33;
  if(i<7)
    mmGateI( x[j]+os, x[j]+os+1, xi[j]+os, xi[j]+os+1 );

}


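// Compare-exchange gate: orders the pair so that *x <= *y afterwards,
// swapping the associated indices *xi and *yi in lockstep.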
__device__ void mmGateI(real *x, real *y, unint *xi, unint *yi){
  unint ti = MINi( *x, *y, *xi, *yi );
  *yi = MAXi( *x, *y, *xi, *yi );
  *xi = ti;
  real t = MIN( *x, *y );
  *y = MAX( *x, *y );
  *x = t;
}

#endif