#ifndef RBC_CU
#define RBC_CU

#include<sys/time.h>
#include<stdio.h>
#include<cuda.h>
#include "utils.h"
#include "defs.h"
#include "utilsGPU.h"
#include "rbc.h"
#include "kernels.h"
#include "kernelWrap.h"
#include "sKernel.h"

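// Random Ball Cover (RBC) search driver.  Builds the RBC data structure over
// the database x and answers the queries in q, writing the resulting NN
// indices into NNs.  Outline: numReps representatives are chosen at random
// from x; build() assigns each representative a list of roughly s nearby
// database points (stored in xmap); each query is then routed to its nearest
// representative, and the search is restricted to that representative's list
// (the computation matrix cM is set to the identity below).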
void rbc(matrix x, matrix q, int numReps, int s, int *NNs){
  int i;
  matrix r;   // random subset of x
  matrix dr;  //device version of r
  int *repIDsQ;  //assignment of each pt to a rep.
  real *radiiQ;  //radius of each group
  int *groupCountX, *groupCountQ;  //num pts in each group
  real *distToRepsQ;  //distance of each pt to its rep
  int *qID;
  charMatrix cM;  //compute matrix; will be treated as a binary matrix
  intMatrix xmap;
  compPlan cP;
  int compLength;
  struct timeval tvB, tvE;

  //convenience variables
  int n = x.r;  //num pts in DB
  int m = q.r;  //num queries
  int pnr = PAD(numReps);

  r.r=numReps; r.c=x.c; r.pr=pnr; r.pc=PAD(r.c); r.ld=r.pc;
  r.mat = (real*)calloc(r.pr*r.pc,sizeof(*(r.mat)));

  repIDsQ = (int*)calloc(m,sizeof(*repIDsQ));
  radiiQ = (real*)calloc(pnr,sizeof(*radiiQ));
  groupCountX = (int*)calloc(pnr,sizeof(*groupCountX));
  groupCountQ = (int*)calloc(pnr,sizeof(*groupCountQ));
  distToRepsQ = (real*)calloc(m,sizeof(*distToRepsQ));

  qID = (int*)calloc(PAD(m+(BLOCK_SIZE-1)*pnr),sizeof(*qID));

  cM.mat = (char*)calloc(pnr*pnr,sizeof(*cM.mat));
  cM.r=numReps; cM.c=numReps; cM.pr=pnr; cM.pc=pnr; cM.ld=cM.pc;

  xmap.r=numReps; xmap.pr=PAD(numReps); xmap.c=s; xmap.pc=PAD(s); xmap.ld=xmap.pc;
  cudaMalloc( (void**)&xmap.mat, xmap.pr*xmap.pc*sizeof(*xmap.mat) );

  //The following lines of code init xmap to 0s.
  intMatrix tempXmap;
  tempXmap.r=xmap.r; tempXmap.pr=xmap.pr; tempXmap.c=xmap.c; tempXmap.pc=xmap.pc; tempXmap.ld=xmap.ld;
  tempXmap.mat = (int*)calloc( tempXmap.pr*tempXmap.pc, sizeof(*tempXmap.mat) );
  cudaMemcpy(xmap.mat, tempXmap.mat, xmap.pr*xmap.pc*sizeof(*xmap.mat), cudaMemcpyHostToDevice);
  free(tempXmap.mat);


  //Choose representatives and move them to device
  int *randInds;
  randInds = (int*)calloc(pnr,sizeof(*randInds));
  subRandPerm(numReps,n,randInds);

  for(i=0;i<numReps;i++){
    copyVector(&r.mat[IDX(i,0,r.ld)], &x.mat[IDX(randInds[i],0,x.ld)], x.c);
  }
  free(randInds);

  dr.r=r.r; dr.c=r.c; dr.pr=r.pr; dr.pc=r.pc; dr.ld=r.ld;
  cudaMalloc( (void**)&(dr.mat), dr.pr*dr.pc*sizeof(*(dr.mat)) );
  cudaMemcpy(dr.mat,r.mat,dr.pr*dr.pc*sizeof(*(dr.mat)),cudaMemcpyHostToDevice);


  matrix dx;
  copyAndMove(&dx, &x);

  build(dx, dr, xmap, groupCountX, s);


  gettimeofday(&tvB,NULL);  //Start the timer for the queries

  matrix dqOrig;
  copyAndMove(&dqOrig, &q);

  computeReps(dqOrig, dr, repIDsQ, distToRepsQ);

  //How many points are assigned to each group?
  computeCounts(repIDsQ, m, groupCountQ);

  //Set up the mapping from groups to queries (qID).
  buildQMap(q, qID, repIDsQ, numReps, &compLength);

  // Determine which blocks need to be compared with one another and
  // store the results in the computation matrix cM.
  //blockIntersection(cM, dr, radiiX, radiiQ);
  idIntersection(cM);

  // Set up the compute plan
  initCompPlan(&cP, cM, groupCountQ, groupCountX, numReps);

  // Compute the NNs according to the compute plan
  computeNNs(dx, dqOrig, xmap, cP, qID, NNs, compLength);

  gettimeofday(&tvE,NULL);
  printf("\t.. time elapsed (for queries) = %6.4f \n",timeDiff(tvB,tvE));

  cudaFree(dr.mat);
  cudaFree(dqOrig.mat);
  freeCompPlan(&cP);
  free(cM.mat);
  free(distToRepsQ);
  free(groupCountX);
  free(groupCountQ);
  free(qID);
  free(radiiQ);
  free(repIDsQ);
  free(r.mat);
  cudaFree(xmap.mat);
  cudaFree(dx.mat);
}

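// Builds the ownership lists of the representatives.  The reps are processed
// in chunks sized to fit in device memory; for each chunk, the distances from
// the reps to every database point are computed, a range containing roughly s
// points is chosen per rep, and the indices of the in-range points are
// compacted into the corresponding rows of xmap.  counts[i] receives the exact
// number of points assigned to representative i.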
void build(const matrix dx, const matrix dr, intMatrix xmap, int *counts, int s){

  int n = dx.pr;
  int p = dr.r;

  //Figure out how much fits into memory
  unsigned int memFree, memTot;
  cuMemGetInfo(&memFree, &memTot);
  memFree = (unsigned int)(((float)memFree)*MEM_USABLE);

  //memory needed per rep = n*sizeof(real)+n*sizeof(char)+n*sizeof(int)+sizeof(real)+sizeof(int)
  //                      = dist mat      + ir           + dSums       + range      + dCnts
  int ptsAtOnce = DPAD(memFree/((n+1)*sizeof(real) + n*sizeof(char) + (n+1)*sizeof(int)));
  if(ptsAtOnce==0){
    fprintf(stderr,"memfree = %u \n",memFree);
    fprintf(stderr,"error: not enough memory to build the RBC.. exiting\n");
    exit(1);
  }

  matrix dD;
  dD.pr=dD.r=ptsAtOnce; dD.c=dx.r; dD.pc=dx.pr; dD.ld=dD.pc;
  cudaMalloc( (void**)&dD.mat, dD.pr*dD.pc*sizeof(*dD.mat) );

  real *dranges;
  cudaMalloc( (void**)&dranges, ptsAtOnce*sizeof(real) );

  charMatrix ir;
  ir.r=dD.r; ir.pr=dD.pr; ir.c=dD.c; ir.pc=dD.pc; ir.ld=dD.ld;
  ir.mat = (char*)calloc( ir.pr*ir.pc, sizeof(*ir.mat) );
  charMatrix dir;
  copyAndMoveC(&dir, &ir);

  intMatrix dSums;  //used to compute memory addresses.
  dSums.r=dir.r; dSums.pr=dir.pr; dSums.c=dir.c; dSums.pc=dir.pc; dSums.ld=dir.ld;
  cudaMalloc( (void**)&dSums.mat, dSums.pc*dSums.pr*sizeof(*dSums.mat) );

  int *dCnts;
  cudaMalloc( (void**)&dCnts, ptsAtOnce*sizeof(*dCnts) );


  int numits=0;
  int numLeft = p;  //points left to process
  int row = 0;      //base row for iteration of while loop
  int pi, pip;      //pi=pts per it, pip=pad(pi)

  while( numLeft > 0 ){
    numits++;
    pi = MIN(ptsAtOnce, numLeft);  //points to do this iteration.
    pip = PAD(pi);
    dD.r = pi; dD.pr = pip; dir.r=pi; dir.pr=pip; dSums.r=pi; dSums.pr=pip;

    distSubMat(dr, dx, dD, row, pip);   //compute the distance matrix
    findRangeWrap(dD, dranges, s);      //find an appropriate range
    rangeSearchWrap(dD, dranges, dir);  //set binary vector for points in range
    printf("after range search\n");
    sumWrap(dir, dSums);  //This and the next call perform the parallel compaction.
    buildMapWrap(xmap, dir, dSums, row);
    getCountsWrap(dCnts,dir,dSums);  //How many points are assigned to each rep?  It is not
                                     //*exactly* s, which is why we need to compute this.
    cudaMemcpy( &counts[row], dCnts, pi*sizeof(*counts), cudaMemcpyDeviceToHost );

    numLeft -= pi;
    row += pi;
  }

  cudaFree(dCnts);
  free(ir.mat);
  cudaFree(dranges);
  cudaFree(dir.mat);
  cudaFree(dSums.mat);
  cudaFree(dD.mat);
}


//Assign each point in dq to its nearest point in dr.
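// The 1-NN search over the representatives runs on the device (nnWrap); the
// winning rep index and its distance are copied back into the host arrays
// repIDs and distToReps, one entry per row of dq.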
void computeReps(matrix dq, matrix dr, int *repIDs, real *distToReps){
  real *dMins;
  int *dMinIDs;

  cudaMalloc((void**)&(dMins), dq.pr*sizeof(*dMins));
  cudaMalloc((void**)&(dMinIDs), dq.pr*sizeof(*dMinIDs));

  nnWrap(dq,dr,dMins,dMinIDs);

  cudaMemcpy(distToReps,dMins,dq.r*sizeof(*dMins),cudaMemcpyDeviceToHost);
  cudaMemcpy(repIDs,dMinIDs,dq.r*sizeof(*dMinIDs),cudaMemcpyDeviceToHost);

  cudaFree(dMins);
  cudaFree(dMinIDs);
}


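// For each representative, records the largest distance to any point assigned
// to it, i.e. the radius of that representative's group.
// (Not called from rbc() above, since the blockIntersection call is disabled.)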
//Assumes radii is initialized to 0s
void computeRadii(int *repIDs, real *distToReps, real *radii, int n, int numReps){
  int i;

  for(i=0;i<n;i++)
    radii[repIDs[i]] = MAX(distToReps[i],radii[repIDs[i]]);
}


//Assumes groupCount is initialized to 0s
void computeCounts(int *repIDs, int n, int *groupCount){
  int i;

  for(i=0;i<n;i++)
    groupCount[repIDs[i]]++;
}


// This function computes a cumulative sum of the groupCounts.
// Assumes groupOff is initialized to 0s.
void computeOffsets(int *groupCount, int n, int *groupOff){
  int i;

  for(i=1;i<n;i++)
    groupOff[i]=groupOff[i-1]+PAD(groupCount[i-1]);
}


// This function sorts points by their repID.  It makes two passes through the
// matrix x; one to count the bucket sizes, the next to place points in the
// correct bucket.  Note that this function unfortunately allocates a temporary
// matrix the size of x, then copies the results over to x at the end.  The
// sort could be done in place, e.g. by doing numReps passes through x instead of 2.
void groupPoints(matrix x, int *xID, int *repIDs, int numReps){
  matrix y;
  int n=x.r;
  int d=x.c;
  int i;
  int *gS;  //groupSize
  int *yID;

  yID = (int*)calloc(n,sizeof(*yID));
  y.mat = (real*)calloc(n*d,sizeof(*y.mat));
  gS = (int*)calloc(numReps+1,sizeof(*gS));

  y.r=n; y.pr=n; y.c=d; y.pc=d; y.ld=d;

  for(i=0;i<n;i++)
    gS[repIDs[i]+1]++;
  for(i=1;i<numReps;i++)
    gS[i]=gS[i-1]+gS[i];

  for(i=0;i<n;i++){
    copyVector(&y.mat[IDX(gS[repIDs[i]],0,y.ld)], &x.mat[IDX(i,0,x.ld)], d);
    yID[gS[repIDs[i]]]=xID[i];
    gS[repIDs[i]]++;
  }

  for(i=0;i<n;i++){
    copyVector(&x.mat[IDX(i,0,x.ld)], &y.mat[IDX(i,0,y.ld)], d);
    xID[i]=yID[i];
  }

  free(yID);
  free(gS);
  free(y.mat);
}


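// Builds qID, the padded list of query indices that drives the NN computation.
// Queries are laid out group by group (grouped by their assigned
// representative), each group padded out via PAD; the padding slots are set to
// DUMMY_IDX.  *compLength receives the total (padded) length of this list.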
void buildQMap(matrix q, int *qID, int *repIDs, int numReps, int *compLength){
  int n=q.r;
  int i;
  int *gS;  //groupSize

  gS = (int*)calloc(numReps+1,sizeof(*gS));

  for( i=0; i<n; i++ )
    gS[repIDs[i]+1]++;
  for( i=0; i<numReps+1; i++ )
    gS[i]=PAD(gS[i]);

  for( i=1; i<numReps+1; i++ )
    gS[i]=gS[i-1]+gS[i];

  *compLength = gS[numReps];

  for( i=0; i<(*compLength); i++ )
    qID[i]=DUMMY_IDX;

  for( i=0; i<n; i++ ){
    qID[gS[repIDs[i]]]=i;
    gS[repIDs[i]]++;
  }

  free(gS);
}


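// Fills in cM by checking which groups can be pruned away: the pairwise
// distances between representatives are computed on the device, and pruneWrap
// uses them together with the group radii to decide which pairs of groups must
// be compared.  Currently unused; rbc() above calls idIntersection instead.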
void blockIntersection(charMatrix cM, matrix dr, real *radiiX, real *radiiQ){
  matrix dD;
  real *dradiiX, *dradiiQ;
  int pnR = dr.pr;
  charMatrix dcM;

  dD.r=dD.c=dr.r; dD.pr=dD.pc=dD.ld=dr.pr;
  dcM.r=cM.r; dcM.c=cM.c; dcM.pr=cM.pr; dcM.pc=cM.pc; dcM.ld=cM.ld;

  cudaMalloc((void**)&dD.mat, pnR*pnR*sizeof(*dD.mat));
  cudaMalloc((void**)&dradiiX, pnR*sizeof(*dradiiX));
  cudaMalloc((void**)&dradiiQ, pnR*sizeof(*dradiiQ));
  cudaMalloc((void**)&dcM.mat, dcM.pr*dcM.pc*sizeof(*dcM.mat));

  // Copying over the radii.  Note that everything beyond the first dr.r
  // entries of the device arrays is left undefined.
  cudaMemcpy(dradiiX,radiiX,dr.r*sizeof(*dradiiX),cudaMemcpyHostToDevice);
  cudaMemcpy(dradiiQ,radiiQ,dr.r*sizeof(*dradiiQ),cudaMemcpyHostToDevice);

  dist1Wrap(dr, dr, dD);
  pruneWrap(dcM, dD, dradiiX, dradiiQ);

  cudaMemcpy(cM.mat,dcM.mat,pnR*pnR*sizeof(*dcM.mat),cudaMemcpyDeviceToHost);

  cudaFree(dcM.mat);
  cudaFree(dradiiQ);
  cudaFree(dradiiX);
  cudaFree(dD.mat);
}


// Sets the computation matrix to the identity.
void idIntersection(charMatrix cM){
  int i;
  for(i=0;i<cM.r;i++){
    if(i<cM.c)
      cM.mat[IDX(i,i,cM.ld)]=1;
  }
}


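// Sets every entry of the computation matrix to 1, i.e. every group of queries
// is compared against every group of database points.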
void fullIntersection(charMatrix cM){
  int i,j;
  for(i=0;i<cM.r;i++){
    for(j=0;j<cM.c;j++){
      cM.mat[IDX(i,j,cM.ld)]=1;
    }
  }
}


// Danger: this function allocates memory that it does not free.
// Use freeCompPlan to clear mem.
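// Layout of the resulting plan: numGroups[i] is the number of database groups
// that query group i must be compared against; groupCountX and groupOff are
// stored row-major with leading dimension ld (the maximum of numGroups[]) and
// hold, for each of those database groups, its size and its padded cumulative
// offset (as computed by computeOffsets); qToGroup maps each padded slot of
// the query list back to its query group.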
void initCompPlan(compPlan *cP, charMatrix cM, int *groupCountQ, int *groupCountX, int numReps){
  int i,j,k;
  int maxNumGroups=0;
  int *groupOff;

  groupOff = (int*)calloc(numReps,sizeof(*groupOff));

  cP->sNumGroups=numReps;
  cP->numGroups = (int*)calloc(cP->sNumGroups,sizeof(*cP->numGroups));

  for(i=0;i<numReps;i++){
    cP->numGroups[i] = 0;
    for(j=0;j<numReps;j++){
      cP->numGroups[i] += cM.mat[IDX(i,j,cM.ld)];
    }
    maxNumGroups = MAX(cP->numGroups[i],maxNumGroups);
  }

  cP->ld = maxNumGroups;

  cP->sGroupCountX = maxNumGroups*numReps;
  cP->groupCountX = (int*)calloc(cP->sGroupCountX,sizeof(*cP->groupCountX));
  cP->sGroupOff = maxNumGroups*numReps;
  cP->groupOff = (int*)calloc(cP->sGroupOff,sizeof(*cP->groupOff));

  computeOffsets(groupCountX,numReps,groupOff);

  int tempSize=0;
  for(i=0;i<numReps;i++)
    tempSize+=PAD(groupCountQ[i]);

  cP->sQToGroup = tempSize;
  cP->qToGroup = (int*)calloc(cP->sQToGroup,sizeof(*cP->qToGroup));

  for(i=0, k=0;i<numReps;i++){
    for(j=0;j<PAD(groupCountQ[i]);j++){
      cP->qToGroup[k]=i;
      k++;
    }
  }

  for(i=0;i<numReps;i++){
    for(j=0, k=0;j<numReps;j++){
      if(cM.mat[IDX(i,j,cM.ld)]){
        cP->groupCountX[IDX(i,k,cP->ld)]=groupCountX[j];
        cP->groupOff[IDX(i,k,cP->ld)]=groupOff[j];
        k++;
      }
    }
  }
  free(groupOff);
}


//Frees memory allocated in initCompPlan.
void freeCompPlan(compPlan *cP){
  free(cP->numGroups);
  free(cP->groupCountX);
  free(cP->groupOff);
  free(cP->qToGroup);
}


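// Carries out the NN computation described by the compute plan.  The plan and
// the padded query-ID list are copied to the device, then planNNKernel is
// launched over the compLength padded slots in chunks of at most
// MAX_BS*BLOCK_SIZE rows; finally the first dq.r NN indices are copied back
// into the host array NNs.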
void computeNNs(matrix dx, matrix dq, intMatrix xmap, compPlan cP, int *qIDs, int *NNs, int compLength){
  compPlan dcP;
  real *dMins;
  int *dMinIDs;

  dcP.ld=cP.ld;
  cudaMalloc((void**)&dcP.numGroups,cP.sNumGroups*sizeof(*dcP.numGroups));
  cudaMalloc((void**)&dcP.groupCountX,cP.sGroupCountX*sizeof(*dcP.groupCountX));
  cudaMalloc((void**)&dcP.groupOff,cP.sGroupOff*sizeof(*dcP.groupOff));
  cudaMalloc((void**)&dcP.qToGroup,cP.sQToGroup*sizeof(*dcP.qToGroup));

  cudaMemcpy(dcP.numGroups,cP.numGroups,cP.sNumGroups*sizeof(*dcP.numGroups),cudaMemcpyHostToDevice);
  cudaMemcpy(dcP.groupCountX,cP.groupCountX,cP.sGroupCountX*sizeof(*dcP.groupCountX),cudaMemcpyHostToDevice);
  cudaMemcpy(dcP.groupOff,cP.groupOff,cP.sGroupOff*sizeof(*dcP.groupOff),cudaMemcpyHostToDevice);
  cudaMemcpy(dcP.qToGroup,cP.qToGroup,cP.sQToGroup*sizeof(*dcP.qToGroup),cudaMemcpyHostToDevice);

  cudaMalloc((void**)&dMins,compLength*sizeof(*dMins));
  cudaMalloc((void**)&dMinIDs,compLength*sizeof(*dMinIDs));

  int *dqIDs;
  cudaMalloc( (void**)&dqIDs, compLength*sizeof(*dqIDs) );
  cudaMemcpy( dqIDs, qIDs, compLength*sizeof(*dqIDs), cudaMemcpyHostToDevice );


  dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
  dim3 dimGrid;
  dimGrid.x = 1;
  int numDone = 0;
  while( numDone<compLength ){
    int todo = MIN( (compLength-numDone), MAX_BS*BLOCK_SIZE );
    dimGrid.y = todo/BLOCK_SIZE;
    planNNKernel<<<dimGrid,dimBlock>>>(dq,dx,dMins,dMinIDs,dcP,xmap,dqIDs,numDone);
    cudaThreadSynchronize();
    numDone += todo;
  }

  cudaMemcpy( NNs, dMinIDs, dq.r*sizeof(*NNs), cudaMemcpyDeviceToHost );


  cudaFree(dcP.numGroups);
  cudaFree(dcP.groupCountX);
  cudaFree(dcP.groupOff);
  cudaFree(dcP.qToGroup);
  cudaFree(dMins);
  cudaFree(dMinIDs);
  cudaFree(dqIDs);
}


//This calls the dist1Kernel wrapper, but has it compute only
//a submatrix of the all-pairs distance matrix.  In particular,
//only distances from dr[start,:] through dr[start+length-1,:] to all of dx
//are computed, resulting in a distance matrix of size
//length by dx.pr.  It is assumed that length is padded.
void distSubMat(matrix dr, matrix dx, matrix dD, int start, int length){
  dr.r=dr.pr=length;
  dr.mat = &dr.mat[IDX( start, 0, dr.ld )];
  dist1Wrap(dr, dx, dD);
}


#endif