Modified the scan kernel so that it can work with > 1024*1024 numbers
[RBC.git] / rbc.cu
#ifndef RBC_CU
#define RBC_CU

#include<sys/time.h>
#include<stdio.h>
#include<cuda.h>
#include "utils.h"
#include "defs.h"
#include "utilsGPU.h"
#include "rbc.h"
#include "kernels.h"
#include "kernelWrap.h"
#include "sKernel.h"

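//Main entry point: builds a random ball cover over the database x using
//numReps randomly chosen representatives (each owning roughly s points of x;
//see build()), then answers a 1-NN query for every row of q and writes the
//resulting database indices into NNs.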
void rbc(matrix x, matrix q, int numReps, int s, int *NNs){
  int i;
  matrix r; // random subset of x
  matrix dr; //device version of r
  int *repIDsQ; //assignment of each pt to a rep.
  real *radiiQ; //radius of each group
  int *groupCountX, *groupCountQ; //num pts in each group
  real *distToRepsQ; //distance of each pt to its rep
  int *qID;
  charMatrix cM; //compute matrix. will be treated as a binary matrix
  intMatrix xmap;
  compPlan cP;
  int compLength;
  struct timeval tvB, tvE;

  //convenience variables
  int n = x.r; //num pts in DB
  int m = q.r; //num queries
  int pnr = PAD(numReps);

  r.r=numReps; r.c=x.c; r.pr=pnr; r.pc=PAD(r.c); r.ld=r.pc;
  r.mat = (real*)calloc(r.pr*r.pc,sizeof(*(r.mat)));

  repIDsQ = (int*)calloc(m,sizeof(*repIDsQ));
  radiiQ = (real*)calloc(pnr,sizeof(*radiiQ));
  groupCountX = (int*)calloc(pnr,sizeof(*groupCountX));
  groupCountQ = (int*)calloc(pnr,sizeof(*groupCountQ));
  distToRepsQ = (real*)calloc(m,sizeof(*distToRepsQ));

  qID = (int*)calloc(PAD(m+(BLOCK_SIZE-1)*pnr),sizeof(*qID));
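  //buildQMap pads each query group's size up (via PAD), which can add at most
  //BLOCK_SIZE-1 dummy slots per group; the (BLOCK_SIZE-1)*pnr term above
  //leaves room for that padding.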

  cM.mat = (char*)calloc(pnr*pnr,sizeof(*cM.mat));
  cM.r=numReps; cM.c=numReps; cM.pr=pnr; cM.pc=pnr; cM.ld=cM.pc;

  xmap.r=numReps; xmap.pr=PAD(numReps); xmap.c=s; xmap.pc=PAD(s); xmap.ld=xmap.pc;
  cudaMalloc( (void**)&xmap.mat, xmap.pr*xmap.pc*sizeof(*xmap.mat) );

  //The following lines of code init xmap to 0s.
  intMatrix tempXmap;
  tempXmap.r=xmap.r; tempXmap.pr=xmap.pr; tempXmap.c=xmap.c; tempXmap.pc=xmap.pc; tempXmap.ld=xmap.ld;
  tempXmap.mat = (int*)calloc( tempXmap.pr*tempXmap.pc, sizeof(*tempXmap.mat) );
  cudaMemcpy(xmap.mat, tempXmap.mat, xmap.pr*xmap.pc*sizeof(*xmap.mat), cudaMemcpyHostToDevice);
  free(tempXmap.mat);


  //Choose representatives and move them to device
  int *randInds;
  randInds = (int*)calloc(pnr,sizeof(*randInds));
  subRandPerm(numReps,n,randInds);

  for(i=0;i<numReps;i++){
    copyVector(&r.mat[IDX(i,0,r.ld)], &x.mat[IDX(randInds[i],0,x.ld)], x.c);
  }
  free(randInds);

  dr.r=r.r; dr.c=r.c; dr.pr=r.pr; dr.pc=r.pc; dr.ld=r.ld;
  cudaMalloc( (void**)&(dr.mat), dr.pr*dr.pc*sizeof(*(dr.mat)) );
  cudaMemcpy(dr.mat,r.mat,dr.pr*dr.pc*sizeof(*(dr.mat)),cudaMemcpyHostToDevice);


  matrix dx;
  copyAndMove(&dx, &x);

  build(dx, dr, xmap, groupCountX, s);

  /* tempXmap.r=xmap.r; tempXmap.pr=xmap.pr; tempXmap.c=xmap.c; tempXmap.pc=xmap.pc; tempXmap.ld=xmap.ld; */
  /* tempXmap.mat = (int*)calloc( tempXmap.pr*tempXmap.pc, sizeof(*tempXmap.mat) ); */
  /* cudaMemcpy(tempXmap.mat, xmap.mat, xmap.pr*xmap.pc*sizeof(*xmap.mat), cudaMemcpyDeviceToHost); */
  /* for( i=0; i<16; i++ ) */
  /*   printf("%d ",tempXmap.mat[i]); */
  /* printf("\n"); */
  /* free(tempXmap.mat); */


  gettimeofday(&tvB,NULL); //Start the timer for the queries

  matrix dqOrig;
  copyAndMove(&dqOrig, &q);

  computeReps(dqOrig, dr, repIDsQ, distToRepsQ);

  //How many points are assigned to each group?
  computeCounts(repIDsQ, m, groupCountQ);


  //Set up the mapping from groups to queries (qID).
  buildQMap(q, qID, repIDsQ, numReps, &compLength);

  // Determine which blocks need to be compared with one another and
  // store the results in the computation matrix cM.
  //blockIntersection(cM, dr, radiiX, radiiQ);
  idIntersection(cM);

  // Set up the compute plan
  initCompPlan(&cP, cM, groupCountQ, groupCountX, numReps);

  // Compute the NNs according to the compute plan
  computeNNs(dx, dqOrig, xmap, cP, qID, NNs, compLength);

  gettimeofday(&tvE,NULL);
  printf("\t.. time elapsed (for queries) = %6.4f \n",timeDiff(tvB,tvE));

  cudaFree(dr.mat);
  cudaFree(dqOrig.mat);
  freeCompPlan(&cP);
  free(cM.mat);
  free(distToRepsQ);
  free(groupCountX);
  free(groupCountQ);
  free(qID);
  free(radiiQ);
  free(repIDsQ);
  free(r.mat);
  cudaFree(xmap.mat);
  cudaFree(dx.mat);
}
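
/* A minimal calling sketch (assumes the matrix fields used above, `real' from
   defs.h, and host buffers allocated at their padded sizes; numReps and s are
   chosen by the caller):

     matrix x, q;
     x.r = n;  x.c = d;  x.pr = PAD(n);  x.pc = PAD(d);  x.ld = x.pc;
     q.r = m;  q.c = d;  q.pr = PAD(m);  q.pc = PAD(d);  q.ld = q.pc;
     x.mat = (real*)calloc( x.pr*x.pc, sizeof(*x.mat) );  //fill with DB points, row-major
     q.mat = (real*)calloc( q.pr*q.pc, sizeof(*q.mat) );  //fill with queries, row-major
     int *NNs = (int*)calloc( m, sizeof(*NNs) );
     rbc( x, q, numReps, s, NNs );  //NNs[i] = index into x of query i's NN
*/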


void build(const matrix dx, const matrix dr, intMatrix xmap, int *counts, int s){

  int n = dx.pr;
  int p = dr.r;

  //Figure out how much fits into memory
  unsigned int memFree, memTot;
  cuMemGetInfo(&memFree, &memTot);
  memFree = (int)(((float)memFree)*MEM_USABLE);

  //mem needed per rep = n*sizeof(real) + n*sizeof(char) + n*sizeof(int) + sizeof(real) + sizeof(int)
  //                   = dist mat       + ir             + dSums         + range        + dCnts
  int ptsAtOnce = DPAD(memFree/((n+1)*sizeof(real) + n*sizeof(char) +(n+1)*sizeof(int)));
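  //Rough arithmetic, assuming real is a 4-byte float: each rep handled per
  //chunk needs about 4n (its row of the distance matrix) + n (in-range flags)
  //+ 4n (running sums) bytes, i.e. roughly 9n bytes, so ptsAtOnce is about
  //usable-memory / 9n, aligned by DPAD.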
  if(ptsAtOnce==0){
    fprintf(stderr,"memfree = %u \n",memFree);
    fprintf(stderr,"error: not enough memory to build the RBC.. exiting\n");
    exit(1);
  }

  matrix dD;
  dD.pr=dD.r=ptsAtOnce; dD.c=dx.r; dD.pc=dx.pr; dD.ld=dD.pc;
  cudaMalloc( (void**)&dD.mat, dD.pr*dD.pc*sizeof(*dD.mat) );

  real *dranges;
  cudaMalloc( (void**)&dranges, ptsAtOnce*sizeof(real) );

  charMatrix ir;
  ir.r=dD.r; ir.pr=dD.pr; ir.c=dD.c; ir.pc=dD.pc; ir.ld=dD.ld;
  ir.mat = (char*)calloc( ir.pr*ir.pc, sizeof(*ir.mat) );
  charMatrix dir;
  copyAndMoveC(&dir, &ir);

  intMatrix dSums; //used to compute memory addresses.
  dSums.r=dir.r; dSums.pr=dir.pr; dSums.c=dir.c; dSums.pc=dir.pc; dSums.ld=dir.ld;
  cudaMalloc( (void**)&dSums.mat, dSums.pc*dSums.pr*sizeof(*dSums.mat) );

  int *dCnts;
  cudaMalloc( (void**)&dCnts, ptsAtOnce*sizeof(*dCnts) );


  int numits=0;
  int numLeft = p; //points left to process
  int row = 0; //base row for iteration of while loop
  int pi, pip; //pi=pts per it, pip=pad(pi)

  while( numLeft > 0 ){
    numits++;
    pi = MIN(ptsAtOnce, numLeft); //points to do this iteration.
    pip = PAD(pi);
    dD.r = pi; dD.pr = pip; dir.r=pi; dir.pr=pip; dSums.r=pi; dSums.pr=pip;

    distSubMat(dr, dx, dD, row, pip); //compute the distance matrix
    findRangeWrap(dD, dranges, s); //find an appropriate range
    rangeSearchWrap(dD, dranges, dir); //set binary vector for points in range

    sumWrap(dir, dSums); //This and the next call perform the parallel compaction.

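    //dir holds a 0/1 in-range flag per (rep, point) pair; sumWrap scans each
    //row so every flagged point gets an output slot, buildMapWrap scatters the
    //corresponding column indices into xmap, and getCountsWrap recovers how
    //many points each rep actually captured.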
    buildMapWrap(xmap, dir, dSums, row);
    getCountsWrap(dCnts,dir,dSums); //How many points are assigned to each rep? It is not
                                    //*exactly* s, which is why we need to compute this.
    cudaMemcpy( &counts[row], dCnts, pi*sizeof(*counts), cudaMemcpyDeviceToHost );

    numLeft -= pi;
    row += pi;
  }

  cudaFree(dCnts);
  free(ir.mat);
  cudaFree(dranges);
  cudaFree(dir.mat);
  cudaFree(dSums.mat);
  cudaFree(dD.mat);
}


//Assign each point in dq to its nearest point in dr.
void computeReps(matrix dq, matrix dr, int *repIDs, real *distToReps){
  real *dMins;
  int *dMinIDs;

  cudaMalloc((void**)&(dMins), dq.pr*sizeof(*dMins));
  cudaMalloc((void**)&(dMinIDs), dq.pr*sizeof(*dMinIDs));

  nnWrap(dq,dr,dMins,dMinIDs);

  cudaMemcpy(distToReps,dMins,dq.r*sizeof(*dMins),cudaMemcpyDeviceToHost);
  cudaMemcpy(repIDs,dMinIDs,dq.r*sizeof(*dMinIDs),cudaMemcpyDeviceToHost);

  cudaFree(dMins);
  cudaFree(dMinIDs);
}



//Assumes radii is initialized to 0s
void computeRadii(int *repIDs, real *distToReps, real *radii, int n, int numReps){
  int i;

  for(i=0;i<n;i++)
    radii[repIDs[i]] = MAX(distToReps[i],radii[repIDs[i]]);
}


//Assumes groupCount is initialized to 0s
void computeCounts(int *repIDs, int n, int *groupCount){
  int i;

  for(i=0;i<n;i++)
    groupCount[repIDs[i]]++;
}


// This function computes a cumulative sum of the (padded) groupCounts.
// Assumes groupOff is initialized to 0s.
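// Example, assuming PAD rounds a count up to a multiple of BLOCK_SIZE (16):
// groupCount = {3, 20, 5}  ==>  groupOff = {0, 16, 48},
// i.e. each group starts where the previous group's padded block ends.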
void computeOffsets(int *groupCount, int n, int *groupOff){
  int i;

  for(i=1;i<n;i++)
    groupOff[i]=groupOff[i-1]+PAD(groupCount[i-1]);
}


// This function sorts points by their repID.  It makes two passes through the
// matrix x: one to count the bucket sizes, the next to place points in the
// correct bucket.  Note that this function unfortunately allocates a temporary
// matrix the size of x, then copies the results over to x at the end.  The
// sort could be done in place, e.g. by doing numReps passes through x instead of 2.
void groupPoints(matrix x, int *xID, int *repIDs, int numReps){
  matrix y;
  int n=x.r;
  int d=x.c;
  int i;
  int *gS; //groupSize
  int *yID;

  yID = (int*)calloc(n,sizeof(*yID));
  y.mat = (real*)calloc(n*d,sizeof(*y.mat));
  gS = (int*)calloc(numReps+1,sizeof(*gS));

  y.r=n; y.pr=n; y.c=d; y.pc=d; y.ld=d;

  for(i=0;i<n;i++)
    gS[repIDs[i]+1]++;
  for(i=1;i<numReps;i++)
    gS[i]=gS[i-1]+gS[i];

  for(i=0;i<n;i++){
    copyVector(&y.mat[IDX(gS[repIDs[i]],0,y.ld)], &x.mat[IDX(i,0,x.ld)],d);
    yID[gS[repIDs[i]]]=xID[i];
    gS[repIDs[i]]++;
  }

  for(i=0;i<n;i++){
    copyVector(&x.mat[IDX(i,0,x.ld)],&y.mat[IDX(i,0,y.ld)],d);
    xID[i]=yID[i];
  }

  free(yID);
  free(gS);
  free(y.mat);
}


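//Builds the group-by-group layout of the queries (qID): queries are placed
//contiguously by their rep, each group's size padded up via PAD and the unused
//slots filled with DUMMY_IDX; *compLength gets the total padded length.
//Example, assuming PAD rounds up to a multiple of BLOCK_SIZE (16): two reps
//with 3 and 17 queries give padded groups of 16 and 32, so compLength=48;
//slots 0..2 and 16..32 hold query indices, the rest hold DUMMY_IDX.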
void buildQMap(matrix q, int *qID, int *repIDs, int numReps, int *compLength){
  int n=q.r;
  int i;
  int *gS; //groupSize

  gS = (int*)calloc(numReps+1,sizeof(*gS));

  for( i=0; i<n; i++ )
    gS[repIDs[i]+1]++;
  for( i=0; i<numReps+1; i++ )
    gS[i]=PAD(gS[i]);

  for( i=1; i<numReps+1; i++ )
    gS[i]=gS[i-1]+gS[i];

  *compLength = gS[numReps];

  for( i=0; i<(*compLength); i++ )
    qID[i]=DUMMY_IDX;

  for( i=0; i<n; i++ ){
    qID[gS[repIDs[i]]]=i;
    gS[repIDs[i]]++;
  }

  free(gS);
}


void blockIntersection(charMatrix cM, matrix dr, real *radiiX, real *radiiQ){
  matrix dD;
  real *dradiiX, *dradiiQ;
  int pnR = dr.pr;
  charMatrix dcM;

  dD.r=dD.c=dr.r; dD.pr=dD.pc=dD.ld=dr.pr;
  dcM.r=cM.r; dcM.c=cM.c; dcM.pr=cM.pr; dcM.pc=cM.pc; dcM.ld=cM.ld;

  cudaMalloc((void**)&dD.mat, pnR*pnR*sizeof(*dD.mat));
  cudaMalloc((void**)&dradiiX, pnR*sizeof(*dradiiX));
  cudaMalloc((void**)&dradiiQ, pnR*sizeof(*dradiiQ));
  cudaMalloc((void**)&dcM.mat, dcM.pr*dcM.pc*sizeof(*dcM.mat));

  // Copying over the radii.  Note that everything past the first dr.r entries
  // of the device arrays is left undefined.
  cudaMemcpy(dradiiX,radiiX,dr.r*sizeof(*dradiiX),cudaMemcpyHostToDevice);
  cudaMemcpy(dradiiQ,radiiQ,dr.r*sizeof(*dradiiQ),cudaMemcpyHostToDevice);

  dist1Wrap(dr, dr, dD);
  pruneWrap(dcM, dD, dradiiX, dradiiQ);

  cudaMemcpy(cM.mat,dcM.mat,pnR*pnR*sizeof(*dcM.mat),cudaMemcpyDeviceToHost);

  cudaFree(dcM.mat);
  cudaFree(dradiiQ);
  cudaFree(dradiiX);
  cudaFree(dD.mat);
}


// Sets the computation matrix to the identity.
void idIntersection(charMatrix cM){
  int i;
  for(i=0;i<cM.r;i++){
    if(i<cM.c)
      cM.mat[IDX(i,i,cM.ld)]=1;
  }
}



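// Sets the computation matrix to all ones, i.e. every group of queries is
// compared against every group of database points.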
void fullIntersection(charMatrix cM){
  int i,j;
  for(i=0;i<cM.r;i++){
    for(j=0;j<cM.c;j++){
      cM.mat[IDX(i,j,cM.ld)]=1;
    }
  }
}


// Danger: this function allocates memory that it does not free.
// Use freeCompPlan to clear mem.
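// Layout of the plan built below: for each group i of queries, numGroups[i]
// is the number of database groups it must be checked against; row i of
// groupCountX and groupOff (leading dimension ld = maxNumGroups) lists those
// groups' sizes and their starting offsets in xmap's ordering; qToGroup maps
// every padded query slot back to its group.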
void initCompPlan(compPlan *cP, charMatrix cM, int *groupCountQ, int *groupCountX, int numReps){
  int i,j,k;
  int maxNumGroups=0;
  int *groupOff;

  groupOff = (int*)calloc(numReps,sizeof(*groupOff));

  cP->sNumGroups=numReps;
  cP->numGroups = (int*)calloc(cP->sNumGroups,sizeof(*cP->numGroups));

  for(i=0;i<numReps;i++){
    cP->numGroups[i] = 0;
    for(j=0;j<numReps;j++){
      cP->numGroups[i] += cM.mat[IDX(i,j,cM.ld)];
    }
    maxNumGroups = MAX(cP->numGroups[i],maxNumGroups);
  }

  cP->ld = maxNumGroups;

  cP->sGroupCountX = maxNumGroups*numReps;
  cP->groupCountX = (int*)calloc(cP->sGroupCountX,sizeof(*cP->groupCountX));
  cP->sGroupOff = maxNumGroups*numReps;
  cP->groupOff = (int*)calloc(cP->sGroupOff,sizeof(*cP->groupOff));

  computeOffsets(groupCountX,numReps,groupOff);

  int tempSize=0;
  for(i=0;i<numReps;i++)
    tempSize+=PAD(groupCountQ[i]);

  cP->sQToGroup = tempSize;
  cP->qToGroup = (int*)calloc(cP->sQToGroup,sizeof(*cP->qToGroup));

  for(i=0, k=0;i<numReps;i++){
    for(j=0;j<PAD(groupCountQ[i]);j++){
      cP->qToGroup[k]=i;
      k++;
    }
  }

  for(i=0;i<numReps;i++){
    for(j=0, k=0;j<numReps;j++){
      if(cM.mat[IDX(i,j,cM.ld)]){
        cP->groupCountX[IDX(i,k,cP->ld)]=groupCountX[j];
        cP->groupOff[IDX(i,k,cP->ld)]=groupOff[j];
        k++;
      }
    }
  }
  free(groupOff);
}


//Frees memory allocated in initCompPlan.
void freeCompPlan(compPlan *cP){
  free(cP->numGroups);
  free(cP->groupCountX);
  free(cP->groupOff);
  free(cP->qToGroup);
}


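//Computes the NNs of the queries according to the compute plan cP.  qIDs maps
//each padded slot of the computation back to its query (DUMMY_IDX for padding),
//and the resulting NN indices are copied back into NNs, one per query.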
void computeNNs(matrix dx, matrix dq, intMatrix xmap, compPlan cP, int *qIDs, int *NNs, int compLength){
  compPlan dcP;
  real *dMins;
  int *dMinIDs;

  dcP.ld=cP.ld;
  cudaMalloc((void**)&dcP.numGroups,cP.sNumGroups*sizeof(*dcP.numGroups));
  cudaMalloc((void**)&dcP.groupCountX,cP.sGroupCountX*sizeof(*dcP.groupCountX));
  cudaMalloc((void**)&dcP.groupOff,cP.sGroupOff*sizeof(*dcP.groupOff));
  cudaMalloc((void**)&dcP.qToGroup,cP.sQToGroup*sizeof(*dcP.qToGroup));

  cudaMemcpy(dcP.numGroups,cP.numGroups,cP.sNumGroups*sizeof(*dcP.numGroups),cudaMemcpyHostToDevice);
  cudaMemcpy(dcP.groupCountX,cP.groupCountX,cP.sGroupCountX*sizeof(*dcP.groupCountX),cudaMemcpyHostToDevice);
  cudaMemcpy(dcP.groupOff,cP.groupOff,cP.sGroupOff*sizeof(*dcP.groupOff),cudaMemcpyHostToDevice);
  cudaMemcpy(dcP.qToGroup,cP.qToGroup,cP.sQToGroup*sizeof(*dcP.qToGroup),cudaMemcpyHostToDevice);

  cudaMalloc((void**)&dMins,compLength*sizeof(*dMins));
  cudaMalloc((void**)&dMinIDs,compLength*sizeof(*dMinIDs));

  int *dqIDs;
  cudaMalloc( (void**)&dqIDs, compLength*sizeof(*dqIDs) );
  cudaMemcpy( dqIDs, qIDs, compLength*sizeof(*dqIDs), cudaMemcpyHostToDevice );


  dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
  dim3 dimGrid;
  dimGrid.x = 1;
  int numDone = 0;
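  //The padded computation is launched in chunks of at most MAX_BS*BLOCK_SIZE
  //rows, since the grid's y-dimension is limited to MAX_BS blocks.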
  while( numDone<compLength ){
    int todo = MIN( (compLength-numDone) , MAX_BS*BLOCK_SIZE );

    dimGrid.y = todo/BLOCK_SIZE;
    planNNKernel<<<dimGrid,dimBlock>>>(dq,dx,dMins,dMinIDs,dcP,xmap,dqIDs,numDone);
    cudaThreadSynchronize();
    numDone += todo;
  }

  cudaMemcpy( NNs, dMinIDs, dq.r*sizeof(*NNs), cudaMemcpyDeviceToHost);

  cudaFree(dcP.numGroups);
  cudaFree(dcP.groupCountX);
  cudaFree(dcP.groupOff);
  cudaFree(dcP.qToGroup);
  cudaFree(dMins);
  cudaFree(dMinIDs);
  cudaFree(dqIDs);
}


//This calls the dist1Kernel wrapper, but has it compute only
//a submatrix of the all-pairs distance matrix.  In particular,
//only distances from dr[start,:] .. dr[start+length-1,:] to all of dx
//are computed, resulting in a distance matrix of size
//length by dx.pr.  It is assumed that length is padded.
void distSubMat(matrix dr, matrix dx, matrix dD, int start, int length){
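  //dr is passed by value, so shrinking it to the desired block of rows here
  //(and pointing its mat at row `start') does not affect the caller's copy.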
  dr.r=dr.pr=length;
  dr.mat = &dr.mat[IDX( start, 0, dr.ld )];
  dist1Wrap(dr, dx, dD);
}


#endif