Stable release of RBC
rbc.cu
#ifndef RBC_CU
#define RBC_CU

#include <sys/time.h>
#include <stdio.h>
#include <cuda.h>
#include "utils.h"
#include "defs.h"
#include "utilsGPU.h"
#include "rbc.h"
#include "kernels.h"
#include "kernelWrap.h"
#include "sKernel.h"

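// Top-level RBC driver.  x is the database (one point per row), q is the query
// matrix, numReps is the number of representatives, s is the number of database
// points stored per representative, and NNs receives one NN index per query.
// A minimal calling sketch (an illustration only, not taken from this file; it
// assumes nothing beyond the matrix fields and the PAD macro used below):
//
//   matrix x, q;
//   x.r = n;  x.c = d;  x.pr = PAD(x.r);  x.pc = PAD(x.c);  x.ld = x.pc;
//   x.mat = (real*)calloc( x.pr*x.pc, sizeof(*x.mat) );
//   /* ... fill x.mat, set up q the same way ... */
//   int *NNs = (int*)calloc( q.r, sizeof(*NNs) );   // at least q.r entries
//   rbc( x, q, numReps, s, NNs );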
void rbc(matrix x, matrix q, int numReps, int s, int *NNs){
  int i;
  matrix r;      // random subset of x
  matrix dr;     // device version of r
  int *repIDsQ;  // assignment of each pt to a rep
  real *radiiQ;  // radius of each group
  int *groupCountX, *groupCountQ;  // num pts in each group
  real *distToRepsQ;  // distance of each pt to its rep
  int *qID;
  charMatrix cM;  // computation matrix; will be treated as a binary matrix
  intMatrix xmap;
  compPlan cP;
  int compLength;
  struct timeval tvB, tvE;

  // convenience variables
  int n = x.r;  // num pts in DB
  int m = q.r;  // num queries
  int pnr = PAD(numReps);

  r.r=numReps; r.c=x.c; r.pr=pnr; r.pc=PAD(r.c); r.ld=r.pc;
  r.mat = (real*)calloc( r.pr*r.pc, sizeof(*(r.mat)) );

  repIDsQ = (int*)calloc( m, sizeof(*repIDsQ) );
  radiiQ = (real*)calloc( pnr, sizeof(*radiiQ) );
  groupCountX = (int*)calloc( pnr, sizeof(*groupCountX) );
  groupCountQ = (int*)calloc( pnr, sizeof(*groupCountQ) );
  distToRepsQ = (real*)calloc( m, sizeof(*distToRepsQ) );

  qID = (int*)calloc( PAD(m+(BLOCK_SIZE-1)*pnr), sizeof(*qID) );

  for( i=0; i<m; i++ )
    qID[i]=i;

  cM.mat = (char*)calloc( pnr*pnr, sizeof(*cM.mat) );
  cM.r=numReps; cM.c=numReps; cM.pr=pnr; cM.pc=pnr; cM.ld=cM.pc;

  xmap.r=numReps; xmap.pr=PAD(numReps); xmap.c=s; xmap.pc=PAD(s); xmap.ld=xmap.pc;
  cudaMalloc( (void**)&xmap.mat, xmap.pr*xmap.pc*sizeof(*xmap.mat) );

  // The following lines of code init xmap to 0s.
  intMatrix tempXmap;
  tempXmap.r=xmap.r; tempXmap.pr=xmap.pr; tempXmap.c=xmap.c; tempXmap.pc=xmap.pc; tempXmap.ld=xmap.ld;
  tempXmap.mat = (int*)calloc( tempXmap.pr*tempXmap.pc, sizeof(*tempXmap.mat) );
  cudaMemcpy( xmap.mat, tempXmap.mat, xmap.pr*xmap.pc*sizeof(*xmap.mat), cudaMemcpyHostToDevice );
  free(tempXmap.mat);


  // Choose representatives and move them to the device
  int *randInds;
  randInds = (int*)calloc( pnr, sizeof(*randInds) );
  subRandPerm(numReps, n, randInds);

  for( i=0; i<numReps; i++ ){
    copyVector(&r.mat[IDX(i,0,r.ld)], &x.mat[IDX(randInds[i],0,x.ld)], x.c);
  }
  free(randInds);

  dr.r=r.r; dr.c=r.c; dr.pr=r.pr; dr.pc=r.pc; dr.ld=r.ld;
  cudaMalloc( (void**)&(dr.mat), dr.pr*dr.pc*sizeof(*(dr.mat)) );
  cudaMemcpy( dr.mat, r.mat, dr.pr*dr.pc*sizeof(*(dr.mat)), cudaMemcpyHostToDevice );


  matrix dx;
  copyAndMove(&dx, &x);

  build(dx, dr, xmap, groupCountX, s);


  gettimeofday(&tvB,NULL);  // start the timer for the queries

  matrix dqOrig;
  copyAndMove(&dqOrig, &q);

  computeReps(dqOrig, dr, repIDsQ, distToRepsQ);

  // How many queries are assigned to each group?
  computeCounts(repIDsQ, m, groupCountQ);

  // Set up the mapping from groups to queries (qID).
  buildQMap(q, qID, repIDsQ, numReps, &compLength);

  // Determine which blocks need to be compared with one another and
  // store the results in the computation matrix cM.
  //blockIntersection(cM, dr, radiiX, radiiQ);
  idIntersection(cM);

  // Set up the compute plan
  initCompPlan(&cP, cM, groupCountQ, groupCountX, numReps);

  // Compute the NNs according to the compute plan
  computeNNs(dx, dqOrig, xmap, cP, qID, NNs, compLength);

  gettimeofday(&tvE,NULL);
  printf("\t.. time elapsed (for queries) = %6.4f \n", timeDiff(tvB,tvE));

  cudaFree(dr.mat);
  cudaFree(dqOrig.mat);
  freeCompPlan(&cP);
  free(cM.mat);
  free(distToRepsQ);
  free(groupCountX);
  free(groupCountQ);
  free(qID);
  free(radiiQ);
  free(repIDsQ);
  free(r.mat);
  cudaFree(xmap.mat);
  cudaFree(dx.mat);
}


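// Builds the RBC data structure on the device.  For each representative (row
// of dr), it computes distances to all of dx, chooses a range that captures
// roughly s points, and compacts the indices of the in-range points into the
// corresponding row of xmap; counts[i] receives the number of points actually
// assigned to rep i.  The work is chunked so that the temporary distance
// matrix fits in GPU memory.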
void build(const matrix dx, const matrix dr, intMatrix xmap, int *counts, int s){

  int n = dx.pr;
  int p = dr.r;

  // Figure out how much fits into memory
  unsigned int memFree, memTot;
  cuMemGetInfo(&memFree, &memTot);
  memFree = (unsigned int)(((float)memFree)*MEM_USABLE);

  // memory needed per rep = n*sizeof(real) + n*sizeof(char) + n*sizeof(int) + sizeof(real) + sizeof(int)
  //                       = dist mat       + ir             + dSums         + range        + dCnts
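  // (for example, with 4-byte reals and ints this is roughly 9*n bytes per
  //  rep, so about 9 MB per rep when n is around one million)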
  int ptsAtOnce = DPAD(memFree/((n+1)*sizeof(real) + n*sizeof(char) + (n+1)*sizeof(int)));
  if(ptsAtOnce==0){
    fprintf(stderr,"memFree = %u \n",memFree);
    fprintf(stderr,"error: not enough memory to build the RBC.. exiting\n");
    exit(1);
  }

  matrix dD;
  dD.pr=dD.r=ptsAtOnce; dD.c=dx.r; dD.pc=dx.pr; dD.ld=dD.pc;
  cudaMalloc( (void**)&dD.mat, dD.pr*dD.pc*sizeof(*dD.mat) );

  real *dranges;
  cudaMalloc( (void**)&dranges, ptsAtOnce*sizeof(real) );

  charMatrix ir;
  ir.r=dD.r; ir.pr=dD.pr; ir.c=dD.c; ir.pc=dD.pc; ir.ld=dD.ld;
  ir.mat = (char*)calloc( ir.pr*ir.pc, sizeof(*ir.mat) );
  charMatrix dir;
  copyAndMoveC(&dir, &ir);

  intMatrix dSums;  // used to compute memory addresses
  dSums.r=dir.r; dSums.pr=dir.pr; dSums.c=dir.c; dSums.pc=dir.pc; dSums.ld=dir.ld;
  cudaMalloc( (void**)&dSums.mat, dSums.pc*dSums.pr*sizeof(*dSums.mat) );

  int *dCnts;
  cudaMalloc( (void**)&dCnts, ptsAtOnce*sizeof(*dCnts) );


  int numits=0;
  int numLeft = p;  // points left to process
  int row = 0;      // base row for this iteration of the while loop
  int pi, pip;      // pi = pts per iteration, pip = PAD(pi)

  while( numLeft > 0 ){
    numits++;
    pi = MIN(ptsAtOnce, numLeft);  // points to do this iteration
    pip = PAD(pi);
    dD.r = pi; dD.pr = pip; dir.r=pi; dir.pr=pip; dSums.r=pi; dSums.pr=pip;

    distSubMat(dr, dx, dD, row, pip);   // compute the distance matrix
    findRangeWrap(dD, dranges, s);      // find an appropriate range
    rangeSearchWrap(dD, dranges, dir);  // set binary vector for points in range

    sumWrap(dir, dSums);  // this and the next call perform the parallel compaction
    buildMapWrap(xmap, dir, dSums, row);
    getCountsWrap(dCnts, dir, dSums);   // how many points are assigned to each rep?
                                        // it is not *exactly* s, which is why we compute it
    cudaMemcpy( &counts[row], dCnts, pi*sizeof(*counts), cudaMemcpyDeviceToHost );

    numLeft -= pi;
    row += pi;
  }

  cudaFree(dCnts);
  free(ir.mat);
  cudaFree(dranges);
  cudaFree(dir.mat);
  cudaFree(dSums.mat);
  cudaFree(dD.mat);
}


// Assign each point in dq to its nearest point in dr.
void computeReps(matrix dq, matrix dr, int *repIDs, real *distToReps){
  real *dMins;
  int *dMinIDs;

  cudaMalloc( (void**)&(dMins), dq.pr*sizeof(*dMins) );
  cudaMalloc( (void**)&(dMinIDs), dq.pr*sizeof(*dMinIDs) );

  nnWrap(dq, dr, dMins, dMinIDs);

  cudaMemcpy( distToReps, dMins, dq.r*sizeof(*dMins), cudaMemcpyDeviceToHost );
  cudaMemcpy( repIDs, dMinIDs, dq.r*sizeof(*dMinIDs), cudaMemcpyDeviceToHost );

  cudaFree(dMins);
  cudaFree(dMinIDs);
}


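// Computes the radius of each representative's group: the largest distance
// from any assigned point to its representative.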
// Assumes radii is initialized to 0s.
void computeRadii(int *repIDs, real *distToReps, real *radii, int n, int numReps){
  int i;

  for( i=0; i<n; i++ )
    radii[repIDs[i]] = MAX( distToReps[i], radii[repIDs[i]] );
}


// Computes a histogram of the rep assignments: groupCount[j] is the number of
// points assigned to rep j.  Assumes groupCount is initialized to 0s.
void computeCounts(int *repIDs, int n, int *groupCount){
  int i;

  for( i=0; i<n; i++ )
    groupCount[repIDs[i]]++;
}


// This function computes a cumulative sum of the (padded) groupCounts.
// Assumes groupOff is initialized to 0s.
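// (e.g., if PAD rounds up to a multiple of 16, counts {3, 5, 2} yield
//  offsets {0, 16, 32})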
void computeOffsets(int *groupCount, int n, int *groupOff){
  int i;

  for( i=1; i<n; i++ )
    groupOff[i] = groupOff[i-1] + PAD(groupCount[i-1]);
}


// This function sorts points by their repID.  It makes two passes through the
// matrix x: one to count the bucket sizes, the next to place points in the
// correct bucket.  Note that this function unfortunately allocates a temporary
// matrix the size of x, then copies the results over to x at the end.  The
// sort could be done in place, e.g. by doing numReps passes through x instead of 2.
void groupPoints(matrix x, int *xID, int *repIDs, int numReps){
  matrix y;
  int n=x.r;
  int d=x.c;
  int i;
  int *gS;  // groupSize
  int *yID;

  yID = (int*)calloc( n, sizeof(*yID) );
  y.mat = (real*)calloc( n*d, sizeof(*y.mat) );
  gS = (int*)calloc( numReps+1, sizeof(*gS) );

  y.r=n; y.pr=n; y.c=d; y.pc=d; y.ld=d;

  for( i=0; i<n; i++ )
    gS[repIDs[i]+1]++;
  for( i=1; i<numReps; i++ )
    gS[i] = gS[i-1] + gS[i];

  for( i=0; i<n; i++ ){
    copyVector(&y.mat[IDX(gS[repIDs[i]],0,y.ld)], &x.mat[IDX(i,0,x.ld)], d);
    yID[gS[repIDs[i]]] = xID[i];
    gS[repIDs[i]]++;
  }

  for( i=0; i<n; i++ ){
    copyVector(&x.mat[IDX(i,0,x.ld)], &y.mat[IDX(i,0,y.ld)], d);
    xID[i] = yID[i];
  }

  free(yID);
  free(gS);
  free(y.mat);
}


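// Builds the group-to-query map.  On exit, qID is reordered so that the
// queries assigned to each representative occupy one contiguous, padded block
// (unused slots are set to DUMMY_IDX), and *compLength is set to the total
// padded length of this reordered list.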
void buildQMap(matrix q, int *qID, int *repIDs, int numReps, int *compLength){
  int n=q.r;
  int i;
  int *gS;  // groupSize
  int *yID;

  gS = (int*)calloc( numReps+1, sizeof(*gS) );

  for( i=0; i<n; i++ )
    gS[repIDs[i]+1]++;
  for( i=0; i<numReps+1; i++ )
    gS[i] = PAD(gS[i]);

  for( i=1; i<numReps+1; i++ )
    gS[i] = gS[i-1] + gS[i];

  *compLength = gS[numReps];

  yID = (int*)calloc( (*compLength), sizeof(*yID) );
  for( i=0; i<(*compLength); i++ )
    yID[i] = DUMMY_IDX;


  for( i=0; i<n; i++ ){
    yID[gS[repIDs[i]]] = qID[i];
    gS[repIDs[i]]++;
  }

  for( i=0; i<(*compLength); i++ ){
    qID[i] = yID[i];
  }

  free(yID);
  free(gS);
}


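// Marks which pairs of representative groups must be compared, by checking the
// rep-to-rep distances against the group radii (see pruneWrap).  In this
// release it is unused; rbc() calls idIntersection() instead.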
void blockIntersection(charMatrix cM, matrix dr, real *radiiX, real *radiiQ){
  matrix dD;
  real *dradiiX, *dradiiQ;
  int pnR = dr.pr;
  charMatrix dcM;

  dD.r=dD.c=dr.r; dD.pr=dD.pc=dD.ld=dr.pr;
  dcM.r=cM.r; dcM.c=cM.c; dcM.pr=cM.pr; dcM.pc=cM.pc; dcM.ld=cM.ld;

  cudaMalloc( (void**)&dD.mat, pnR*pnR*sizeof(*dD.mat) );
  cudaMalloc( (void**)&dradiiX, pnR*sizeof(*dradiiX) );
  cudaMalloc( (void**)&dradiiQ, pnR*sizeof(*dradiiQ) );
  cudaMalloc( (void**)&dcM.mat, dcM.pr*dcM.pc*sizeof(*dcM.mat) );

  // Copying over the radii.  Note that everything after the first dr.r places
  // of the device variables is undefined.
  cudaMemcpy( dradiiX, radiiX, dr.r*sizeof(*dradiiX), cudaMemcpyHostToDevice );
  cudaMemcpy( dradiiQ, radiiQ, dr.r*sizeof(*dradiiQ), cudaMemcpyHostToDevice );

  dist1Wrap(dr, dr, dD);
  pruneWrap(dcM, dD, dradiiX, dradiiQ);

  cudaMemcpy( cM.mat, dcM.mat, pnR*pnR*sizeof(*dcM.mat), cudaMemcpyDeviceToHost );

  cudaFree(dcM.mat);
  cudaFree(dradiiQ);
  cudaFree(dradiiX);
  cudaFree(dD.mat);
}


// Sets the computation matrix to the identity.
void idIntersection(charMatrix cM){
  int i;
  for( i=0; i<cM.r; i++ ){
    if( i<cM.c )
      cM.mat[IDX(i,i,cM.ld)] = 1;
  }
}


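// Sets every entry of the computation matrix to 1 (compare all pairs of groups).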
void fullIntersection(charMatrix cM){
  int i, j;
  for( i=0; i<cM.r; i++ ){
    for( j=0; j<cM.c; j++ ){
      cM.mat[IDX(i,j,cM.ld)] = 1;
    }
  }
}


// Danger: this function allocates memory that it does not free.
// Use freeCompPlan to clear mem.
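// The compute plan records, for each representative, how many database groups
// its queries must be compared against (numGroups), the size and offset of
// each of those groups (groupCountX, groupOff), and a map from each padded
// query slot back to its representative (qToGroup).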
void initCompPlan(compPlan *cP, charMatrix cM, int *groupCountQ, int *groupCountX, int numReps){
  int i, j, k;
  int maxNumGroups = 0;
  int *groupOff;

  groupOff = (int*)calloc( numReps, sizeof(*groupOff) );

  cP->sNumGroups = numReps;
  cP->numGroups = (int*)calloc( cP->sNumGroups, sizeof(*cP->numGroups) );

  for( i=0; i<numReps; i++ ){
    cP->numGroups[i] = 0;
    for( j=0; j<numReps; j++ ){
      cP->numGroups[i] += cM.mat[IDX(i,j,cM.ld)];
    }
    maxNumGroups = MAX( cP->numGroups[i], maxNumGroups );
  }

  cP->ld = maxNumGroups;

  cP->sGroupCountX = maxNumGroups*numReps;
  cP->groupCountX = (int*)calloc( cP->sGroupCountX, sizeof(*cP->groupCountX) );
  cP->sGroupOff = maxNumGroups*numReps;
  cP->groupOff = (int*)calloc( cP->sGroupOff, sizeof(*cP->groupOff) );

  computeOffsets(groupCountX, numReps, groupOff);

  int tempSize = 0;
  for( i=0; i<numReps; i++ )
    tempSize += PAD(groupCountQ[i]);

  cP->sQToGroup = tempSize;
  cP->qToGroup = (int*)calloc( cP->sQToGroup, sizeof(*cP->qToGroup) );

  for( i=0, k=0; i<numReps; i++ ){
    for( j=0; j<PAD(groupCountQ[i]); j++ ){
      cP->qToGroup[k] = i;
      k++;
    }
  }

  for( i=0; i<numReps; i++ ){
    for( j=0, k=0; j<numReps; j++ ){
      if( cM.mat[IDX(i,j,cM.ld)] ){
        cP->groupCountX[IDX(i,k,cP->ld)] = groupCountX[j];
        cP->groupOff[IDX(i,k,cP->ld)] = groupOff[j];
        k++;
      }
    }
  }
  free(groupOff);
}


// Frees the memory allocated in initCompPlan.
void freeCompPlan(compPlan *cP){
  free(cP->numGroups);
  free(cP->groupCountX);
  free(cP->groupOff);
  free(cP->qToGroup);
}


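// Runs the NN search: copies the compute plan and query-ID map to the device,
// then launches planNNKernel over the padded query list (compLength entries)
// in chunks of at most MAX_BS*BLOCK_SIZE rows, and finally copies the
// resulting NN indices back into NNs.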
void computeNNs(matrix dx, matrix dq, intMatrix xmap, compPlan cP, int *qIDs, int *NNs, int compLength){
  compPlan dcP;
  real *dMins;
  int *dMinIDs;

  dcP.ld = cP.ld;
  cudaMalloc( (void**)&dcP.numGroups, cP.sNumGroups*sizeof(*dcP.numGroups) );
  cudaMalloc( (void**)&dcP.groupCountX, cP.sGroupCountX*sizeof(*dcP.groupCountX) );
  cudaMalloc( (void**)&dcP.groupOff, cP.sGroupOff*sizeof(*dcP.groupOff) );
  cudaMalloc( (void**)&dcP.qToGroup, cP.sQToGroup*sizeof(*dcP.qToGroup) );

  cudaMemcpy( dcP.numGroups, cP.numGroups, cP.sNumGroups*sizeof(*dcP.numGroups), cudaMemcpyHostToDevice );
  cudaMemcpy( dcP.groupCountX, cP.groupCountX, cP.sGroupCountX*sizeof(*dcP.groupCountX), cudaMemcpyHostToDevice );
  cudaMemcpy( dcP.groupOff, cP.groupOff, cP.sGroupOff*sizeof(*dcP.groupOff), cudaMemcpyHostToDevice );
  cudaMemcpy( dcP.qToGroup, cP.qToGroup, cP.sQToGroup*sizeof(*dcP.qToGroup), cudaMemcpyHostToDevice );

  cudaMalloc( (void**)&dMins, compLength*sizeof(*dMins) );
  cudaMalloc( (void**)&dMinIDs, compLength*sizeof(*dMinIDs) );

  int *dqIDs;
  cudaMalloc( (void**)&dqIDs, compLength*sizeof(*dqIDs) );
  cudaMemcpy( dqIDs, qIDs, compLength*sizeof(*dqIDs), cudaMemcpyHostToDevice );


  dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
  dim3 dimGrid;
  dimGrid.x = 1;
  int numDone = 0;
  while( numDone < compLength ){
    int todo = MIN( (compLength-numDone), MAX_BS*BLOCK_SIZE );
    dimGrid.y = todo/BLOCK_SIZE;
    planNNKernel<<<dimGrid,dimBlock>>>(dq, dx, dMins, dMinIDs, dcP, xmap, dqIDs, numDone);
    cudaThreadSynchronize();
    numDone += todo;
  }

  cudaMemcpy( NNs, dMinIDs, dq.r*sizeof(*NNs), cudaMemcpyDeviceToHost );


  cudaFree(dcP.numGroups);
  cudaFree(dcP.groupCountX);
  cudaFree(dcP.groupOff);
  cudaFree(dcP.qToGroup);
  cudaFree(dMins);
  cudaFree(dMinIDs);
  cudaFree(dqIDs);
}


// This calls the dist1Kernel wrapper, but has it compute only a submatrix of
// the all-pairs distance matrix.  In particular, only distances from
// dr[start,:] .. dr[start+length-1,:] to all of dx are computed, resulting in
// a distance matrix of size length by dx.pr.  It is assumed that length is padded.
void distSubMat(matrix dr, matrix dx, matrix dD, int start, int length){
  dr.r = dr.pr = length;
  dr.mat = &dr.mat[IDX( start, 0, dr.ld )];
  dist1Wrap(dr, dx, dD);
}


#endif