/* rbc.cu — improved driver for the Random Ball Cover (RBC) library */
1 /* This file is part of the Random Ball Cover (RBC) library.
2 * (C) Copyright 2010, Lawrence Cayton [lcayton@tuebingen.mpg.de]
3 */
4
5 #ifndef RBC_CU
6 #define RBC_CU
7
8 #include<sys/time.h>
9 #include<stdio.h>
10 #include<cuda.h>
11 #include "utils.h"
12 #include "defs.h"
13 #include "utilsGPU.h"
14 #include "rbc.h"
15 #include "kernels.h"
16 #include "kernelWrap.h"
17 #include "sKernelWrap.h"
18
//Performs a 1-NN search of the queries q against the data indexed in rbcS.
//On return, NNs[i] holds the ID (row index into the original database) of the
//nearest neighbor found for query i, and NNdists[i] the distance to it.
//NNs and NNdists must each have room for q.r entries.
void queryRBC(const matrix q, const rbcStruct rbcS, unint *NNs, real* NNdists){
  unint m = q.r;              //number of queries
  unint numReps = rbcS.dr.r;  //number of representatives
  unint compLength;           //padded length of the group-ordered query list
  compPlan dcP;               //device-side computation plan
  unint *qMap, *dqMap;
  //qMap is sized for the worst case: every group padded up to a full block.
  qMap = (unint*)calloc(PAD(m+(BLOCK_SIZE-1)*PAD(numReps)),sizeof(*qMap));
  matrix dq;
  copyAndMove(&dq, &q);       //device copy of the queries

  //Computation matrix: cM[i][j] != 0 means queries of group i are compared
  //against the points of representative j.
  charMatrix cM;
  cM.r=cM.c=numReps; cM.pr=cM.pc=cM.ld=PAD(numReps);
  cM.mat = (char*)calloc( cM.pr*cM.pc, sizeof(*cM.mat) );

  unint *repIDsQ;             //nearest representative of each query
  repIDsQ = (unint*)calloc( m, sizeof(*repIDsQ) );
  real *distToRepsQ;          //distance from each query to that representative
  distToRepsQ = (real*)calloc( m, sizeof(*distToRepsQ) );
  unint *groupCountQ;         //number of queries assigned to each representative
  groupCountQ = (unint*)calloc( PAD(numReps), sizeof(*groupCountQ) );

  computeReps(dq, rbcS.dr, repIDsQ, distToRepsQ);

  //How many points are assigned to each group?
  computeCounts(repIDsQ, m, groupCountQ);

  //Set up the mapping from groups to queries (qMap).
  buildQMap(q, qMap, repIDsQ, numReps, &compLength);

  // Setup the computation matrix.  Currently, the computation matrix is
  // just the identity matrix: each query assigned to a particular
  // representative is compared only to that representative's points.
  idIntersection(cM);

  initCompPlan(&dcP, cM, groupCountQ, rbcS.groupCount, numReps);

  //Move the group-to-query map to the device.
  checkErr( cudaMalloc( (void**)&dqMap, compLength*sizeof(*dqMap) ) );
  cudaMemcpy( dqMap, qMap, compLength*sizeof(*dqMap), cudaMemcpyHostToDevice );

  computeNNs(rbcS.dx, rbcS.dxMap, dq, dqMap, dcP, NNs, NNdists, compLength);

  free(qMap);
  cudaFree(dqMap);
  freeCompPlan(&dcP);
  cudaFree(dq.mat);
  free(cM.mat);
  free(repIDsQ);
  free(distToRepsQ);
  free(groupCountQ);
}
69
//This function is very similar to queryRBC, with a couple of basic changes to
//handle k-nn.  Row i of NNs/NNdists receives the K nearest-neighbor IDs and
//distances for query i.
void kqueryRBC(const matrix q, const rbcStruct rbcS, intMatrix NNs, matrix NNdists){
  unint m = q.r;              //number of queries
  unint numReps = rbcS.dr.r;  //number of representatives
  unint compLength;           //padded length of the group-ordered query list
  compPlan dcP;               //device-side computation plan
  unint *qMap, *dqMap;
  //qMap is sized for the worst case: every group padded up to a full block.
  qMap = (unint*)calloc(PAD(m+(BLOCK_SIZE-1)*PAD(numReps)),sizeof(*qMap));
  matrix dq;
  copyAndMove(&dq, &q);       //device copy of the queries

  //Computation matrix: which query groups get compared to which DB groups.
  charMatrix cM;
  cM.r=cM.c=numReps; cM.pr=cM.pc=cM.ld=PAD(numReps);
  cM.mat = (char*)calloc( cM.pr*cM.pc, sizeof(*cM.mat) );

  unint *repIDsQ;             //nearest representative of each query
  repIDsQ = (unint*)calloc( m, sizeof(*repIDsQ) );
  real *distToRepsQ;          //distance from each query to that representative
  distToRepsQ = (real*)calloc( m, sizeof(*distToRepsQ) );
  unint *groupCountQ;         //number of queries assigned to each representative
  groupCountQ = (unint*)calloc( PAD(numReps), sizeof(*groupCountQ) );

  computeReps(dq, rbcS.dr, repIDsQ, distToRepsQ);

  //How many points are assigned to each group?
  computeCounts(repIDsQ, m, groupCountQ);

  //Set up the mapping from groups to queries (qMap).
  buildQMap(q, qMap, repIDsQ, numReps, &compLength);

  // Setup the computation matrix.  Currently, the computation matrix is
  // just the identity matrix: each query assigned to a particular
  // representative is compared only to that representative's points.

  // NOTE: currently, idIntersection is the *only* computation matrix
  // that will work properly with k-nn search (this is not true for 1-nn above).
  idIntersection(cM);

  initCompPlan(&dcP, cM, groupCountQ, rbcS.groupCount, numReps);

  //Move the group-to-query map to the device.
  checkErr( cudaMalloc( (void**)&dqMap, compLength*sizeof(*dqMap) ) );
  cudaMemcpy( dqMap, qMap, compLength*sizeof(*dqMap), cudaMemcpyHostToDevice );

  computeKNNs(rbcS.dx, rbcS.dxMap, dq, dqMap, dcP, NNs, NNdists, compLength);

  free(qMap);
  cudaFree(dqMap);  //BUG FIX: dqMap was previously leaked (cf. queryRBC's cleanup)
  freeCompPlan(&dcP);
  cudaFree(dq.mat);
  free(cM.mat);
  free(repIDsQ);
  free(distToRepsQ);
  free(groupCountQ);
}
124
125
//Builds the RBC data structure over the database x: chooses numReps random
//representatives, moves the data to the device, and assigns roughly the
//s closest points of x to each representative (stored in rbcS->dxMap,
//with per-rep counts in rbcS->groupCount).  Release with destroyRBC.
void buildRBC(const matrix x, rbcStruct *rbcS, unint numReps, unint s){
  unint n = x.pr;  //padded number of database points
  intMatrix xmap;

  setupReps(x, rbcS, numReps);
  copyAndMove(&rbcS->dx, &x);

  xmap.r=numReps; xmap.pr=PAD(numReps); xmap.c=s; xmap.pc=xmap.ld=PAD(s);
  xmap.mat = (unint*)calloc( xmap.pr*xmap.pc, sizeof(*xmap.mat) );
  copyAndMoveI(&rbcS->dxMap, &xmap);
  rbcS->groupCount = (unint*)calloc( PAD(numReps), sizeof(*rbcS->groupCount) );

  //Figure out how much fits into memory
  size_t memFree, memTot;
  cudaMemGetInfo(&memFree, &memTot);
  //BUG FIX: the usable amount was previously truncated through a 32-bit
  //(unint) cast, which overflows on devices with more than 4GB free.
  //Use double for the scaling to keep precision over the full size_t range.
  memFree = (size_t)(((double)memFree)*MEM_USABLE);
  /* mem needed per rep:
   *  n*sizeof(real) - dist mat
   *  n*sizeof(char) - dir
   *  n*sizeof(int)  - dSums
   *  sizeof(real)   - dranges
   *  sizeof(int)    - dCnts
   *  MEM_USED_IN_SCAN - memory used internally
   */
  unint ptsAtOnce = DPAD(memFree/((n+1)*sizeof(real) + n*sizeof(char) + (n+1)*sizeof(unint) + 2*MEM_USED_IN_SCAN(n)));
  if(!ptsAtOnce){
    fprintf(stderr,"error: %lu is not enough memory to build the RBC.. exiting\n", (unsigned long)memFree);
    exit(1);
  }

  //Now set everything up for the scans
  matrix dD;  //distance submatrix: ptsAtOnce reps x all of x
  dD.pr=dD.r=ptsAtOnce; dD.c=rbcS->dx.r; dD.pc=rbcS->dx.pr; dD.ld=dD.pc;
  checkErr( cudaMalloc( (void**)&dD.mat, dD.pr*dD.pc*sizeof(*dD.mat) ) );

  real *dranges;  //per-rep radius chosen so that ~s points fall inside
  checkErr( cudaMalloc( (void**)&dranges, ptsAtOnce*sizeof(real) ) );

  charMatrix ir;  //indicator: ir[i][j]=1 iff point j is within range of rep i
  ir.r=dD.r; ir.pr=dD.pr; ir.c=dD.c; ir.pc=dD.pc; ir.ld=dD.ld;
  ir.mat = (char*)calloc( ir.pr*ir.pc, sizeof(*ir.mat) );
  charMatrix dir;
  copyAndMoveC(&dir, &ir);

  intMatrix dSums; //used to compute memory addresses.
  dSums.r=dir.r; dSums.pr=dir.pr; dSums.c=dir.c; dSums.pc=dir.pc; dSums.ld=dir.ld;
  checkErr( cudaMalloc( (void**)&dSums.mat, dSums.pc*dSums.pr*sizeof(*dSums.mat) ) );

  unint *dCnts;
  checkErr( cudaMalloc( (void**)&dCnts, ptsAtOnce*sizeof(*dCnts) ) );

  //Do the scans to build the dxMap, processing ptsAtOnce reps per pass.
  unint numLeft = rbcS->dr.r; //points left to process
  unint row = 0;              //base row for iteration of while loop
  unint pi, pip;              //pi=pts per it, pip=pad(pi)
  while( numLeft > 0 ){
    pi = MIN(ptsAtOnce, numLeft); //points to do this iteration.
    pip = PAD(pi);
    //Shrink the working matrices to this pass's row count.
    dD.r = pi; dD.pr = pip; dir.r=pi; dir.pr=pip; dSums.r=pi; dSums.pr=pip;

    distSubMat(rbcS->dr, rbcS->dx, dD, row, pip); //compute the distance matrix
    findRangeWrap(dD, dranges, s);      //find an appropriate range
    rangeSearchWrap(dD, dranges, dir);  //set binary vector for points in range
    sumWrap(dir, dSums);  //This and the next call perform the parallel compaction.
    buildMapWrap(rbcS->dxMap, dir, dSums, row);
    getCountsWrap(dCnts,dir,dSums);  //How many points are assigned to each rep?  It is not
                                     //*exactly* s, which is why we need to compute this.
    cudaMemcpy( &rbcS->groupCount[row], dCnts, pi*sizeof(*rbcS->groupCount), cudaMemcpyDeviceToHost );

    numLeft -= pi;
    row += pi;
  }

  cudaFree(dCnts);
  free(ir.mat);
  free(xmap.mat);
  cudaFree(dranges);
  cudaFree(dir.mat);
  cudaFree(dSums.mat);
  cudaFree(dD.mat);
}
207
208
// Choose representatives and move them to device.
// Picks numReps rows of x uniformly at random (without replacement),
// gathers them into a temporary host matrix, and copies that matrix to
// the device as rbcS->dr.
void setupReps(matrix x, rbcStruct *rbcS, int numReps){
  unint *repChoices = (unint*)calloc( PAD(numReps), sizeof(*repChoices) );
  subRandPerm(numReps, x.r, repChoices);  //random subset of row indices of x

  matrix reps;
  reps.r  = numReps;
  reps.c  = x.c;
  reps.pr = PAD(numReps);
  reps.pc = PAD(reps.c);
  reps.ld = reps.pc;
  reps.mat = (real*)calloc( reps.pr*reps.pc, sizeof(*reps.mat) );

  //Gather the chosen rows of x into the representative matrix.
  unint j;
  for( j=0; j<numReps; j++ )
    copyVector( &reps.mat[IDX(j,0,reps.ld)], &x.mat[IDX(repChoices[j],0,x.ld)], x.c );

  copyAndMove(&rbcS->dr, &reps);  //device copy lives on in rbcS

  free(reps.mat);
  free(repChoices);
}
228
229
//Assign each point in dq to its nearest point in dr.
//repIDs[i] receives the index of the closest representative of query i,
//and distToReps[i] the distance to it; both must hold dq.r entries.
void computeReps(matrix dq, matrix dr, unint *repIDs, real *distToReps){
  real *dNearDists;
  unint *dNearIDs;

  checkErr( cudaMalloc( (void**)&dNearDists, dq.pr*sizeof(*dNearDists) ) );
  checkErr( cudaMalloc( (void**)&dNearIDs, dq.pr*sizeof(*dNearIDs) ) );

  //Brute-force 1-NN of every query row against the representatives.
  nnWrap( dq, dr, dNearDists, dNearIDs );

  //Only the first dq.r entries are meaningful (the rest is padding).
  cudaMemcpy( distToReps, dNearDists, dq.r*sizeof(*dNearDists), cudaMemcpyDeviceToHost );
  cudaMemcpy( repIDs, dNearIDs, dq.r*sizeof(*dNearIDs), cudaMemcpyDeviceToHost );

  cudaFree( dNearIDs );
  cudaFree( dNearDists );
}
246
247
//Assumes radii is initialized to 0s.
//After the call, radii[r] is the largest distance from representative r
//to any of the n points assigned to it.  (numReps is unused; kept for
//interface compatibility.)
void computeRadii(unint *repIDs, real *distToReps, real *radii, unint n, unint numReps){
  unint j;
  for( j=0; j<n; j++ ){
    unint rep = repIDs[j];
    if( distToReps[j] > radii[rep] )
      radii[rep] = distToReps[j];
  }
}
255
256
//Assumes groupCount is initialized to 0s.
//Tallies how many of the n points are assigned to each representative.
void computeCounts(unint *repIDs, unint n, unint *groupCount){
  unint j;
  for( j=0; j<n; j++ )
    ++groupCount[ repIDs[j] ];
}
264
265
//Builds qMap: a group-ordered list of query indices.  All queries assigned
//to representative 0 come first (their slot count padded via PAD), then
//those of representative 1, and so on.  Unused padding slots are filled
//with DUMMY_IDX.  *compLength receives the total (padded) length.
void buildQMap(matrix q, unint *qMap, unint *repIDs, unint numReps, unint *compLength){
  unint n=q.r;
  unint i;
  unint *gS; //groupSize

  gS = (unint*)calloc(numReps+1,sizeof(*gS));

  //Histogram the queries by representative (offset by one so that the
  //prefix sum below turns counts into group start offsets).
  for( i=0; i<n; i++ )
    gS[repIDs[i]+1]++;
  //Pad each group size so every group occupies whole blocks.
  for( i=0; i<numReps+1; i++ )
    gS[i]=PAD(gS[i]);

  //Prefix sum: gS[i] is now the starting slot of group i.
  for( i=1; i<numReps+1; i++ )
    gS[i]=gS[i-1]+gS[i];

  *compLength = gS[numReps];

  //Mark every slot unused...
  for( i=0; i<(*compLength); i++ )
    qMap[i]=DUMMY_IDX;

  //...then scatter each query into the next free slot of its group
  //(gS[g] is advanced as group g fills up).
  for( i=0; i<n; i++ ){
    qMap[gS[repIDs[i]]]=i;
    gS[repIDs[i]]++;
  }

  free(gS);
}
293
294
// Sets the computation matrix to the identity: each query group is
// compared only against its own representative's points.
void idIntersection(charMatrix cM){
  unint d = MIN( cM.r, cM.c );  //diagonal length of a possibly non-square matrix
  unint j;
  for( j=0; j<d; j++ )
    cM.mat[IDX(j,j,cM.ld)] = 1;
}
303
304
// Sets the computation matrix to all-ones: every query group is compared
// against every representative's points (exhaustive search).
void fullIntersection(charMatrix cM){
  unint row, col;
  for( row=0; row<cM.r; row++ )
    for( col=0; col<cM.c; col++ )
      cM.mat[IDX(row,col,cM.ld)] = 1;
}
313
314
//1-NN search driver: runs the planned NN kernel over the group-ordered
//queries and copies the per-query winners back to the host arrays
//NNs/NNdists (dq.r entries each).
void computeNNs(matrix dx, intMatrix dxMap, matrix dq, unint *dqMap, compPlan dcP, unint *NNs, real *NNdists, unint compLength){
  real *devDists;
  unint *devIDs;

  checkErr( cudaMalloc( (void**)&devDists, compLength*sizeof(*devDists) ) );
  checkErr( cudaMalloc( (void**)&devIDs, compLength*sizeof(*devIDs) ) );

  planNNWrap( dq, dqMap, dx, dxMap, devDists, devIDs, dcP, compLength );

  //Only the first dq.r results correspond to real queries.
  cudaMemcpy( NNs, devIDs, dq.r*sizeof(*NNs), cudaMemcpyDeviceToHost );
  cudaMemcpy( NNdists, devDists, dq.r*sizeof(*NNdists), cudaMemcpyDeviceToHost );

  cudaFree( devIDs );
  cudaFree( devDists );
}
329
330
//k-NN search driver: runs the planned k-NN kernel and copies the K
//neighbor IDs/distances per query back into the host matrices
//NNs/NNdists (dq.r rows of K entries each).
void computeKNNs(matrix dx, intMatrix dxMap, matrix dq, unint *dqMap, compPlan dcP, intMatrix NNs, matrix NNdists, unint compLength){
  matrix devDists;
  intMatrix devIDs;

  //One row per (padded) query slot, K columns: the K nearest neighbors.
  devDists.r = devDists.pr = compLength;
  devDists.c = devDists.pc = devDists.ld = K;
  devIDs.r = devIDs.pr = compLength;
  devIDs.c = devIDs.pc = devIDs.ld = K;

  checkErr( cudaMalloc( (void**)&devDists.mat, devDists.pr*devDists.pc*sizeof(*devDists.mat) ) );
  checkErr( cudaMalloc( (void**)&devIDs.mat, devIDs.pr*devIDs.pc*sizeof(*devIDs.mat) ) );

  planKNNWrap( dq, dqMap, dx, dxMap, devDists, devIDs, dcP, compLength );

  //Only the first dq.r rows correspond to real queries.
  cudaMemcpy( NNs.mat, devIDs.mat, dq.r*K*sizeof(*NNs.mat), cudaMemcpyDeviceToHost );
  cudaMemcpy( NNdists.mat, devDists.mat, dq.r*K*sizeof(*NNdists.mat), cudaMemcpyDeviceToHost );

  cudaFree( devDists.mat );
  cudaFree( devIDs.mat );
}
347
348
//This calls the dist1Kernel wrapper, but has it compute only
//a submatrix of the all-pairs distance matrix.  In particular,
//only distances from dr[start,:].. dr[start+length-1] to all of x
//are computed, resulting in a distance matrix of size
//length by dx.pr.  It is assumed that length is padded.
void distSubMat(matrix dr, matrix dx, matrix dD, unint start, unint length){
  //dr is passed by value, so these edits affect only the local copy:
  //shrink it to the requested band of rows...
  dr.r=dr.pr=length;
  //...and slide its data pointer down to row `start`.
  dr.mat = &dr.mat[IDX( start, 0, dr.ld )];
  dist1Wrap(dr, dx, dD);
}
359
360
//Releases all memory owned by an rbcStruct populated by buildRBC:
//the device-resident database, map, and representatives, plus the
//host-side group counts.  (The frees are independent, so order is free.)
void destroyRBC(rbcStruct *rbcS){
  free( rbcS->groupCount );
  cudaFree( rbcS->dr.mat );
  cudaFree( rbcS->dxMap.mat );
  cudaFree( rbcS->dx.mat );
}
367
368
/* Danger: this function allocates memory that it does not free.
 * Use freeCompPlan to clear mem.
 * See the readme.txt file for a description of why this function is needed.
 */
//Converts the computation matrix cM into a device-resident compPlan:
//  numGroups[i]        - number of DB groups query-group i is compared to
//  qToQGroup[j]        - the query group that padded query slot j belongs to
//  qGroupToXGroup[i,k] - the k-th DB group compared against query group i
//  groupCountX[i,k]    - size of that DB group
//  ld                  - row stride (max groups any query group touches)
void initCompPlan(compPlan *dcP, charMatrix cM, unint *groupCountQ, unint *groupCountX, unint numReps){
  unint i,j,k;
  unint maxNumGroups=0;
  compPlan cP;  //host-side staging copy of the plan

  unint sNumGroups = numReps;
  cP.numGroups = (unint*)calloc(sNumGroups, sizeof(*cP.numGroups));

  //numGroups[i] = number of nonzero entries in row i of cM.
  for(i=0; i<numReps; i++){
    cP.numGroups[i] = 0;
    for(j=0; j<numReps; j++)
      cP.numGroups[i] += cM.mat[IDX(i,j,cM.ld)];
    maxNumGroups = MAX(cP.numGroups[i], maxNumGroups);
  }
  cP.ld = maxNumGroups;

  //qToQGroup has one entry per padded query slot (matching buildQMap's layout).
  unint sQToQGroup;
  for(i=0, sQToQGroup=0; i<numReps; i++)
    sQToQGroup += PAD(groupCountQ[i]);

  cP.qToQGroup = (unint*)calloc( sQToQGroup, sizeof(*cP.qToQGroup) );

  for(i=0, k=0; i<numReps; i++){
    for(j=0; j<PAD(groupCountQ[i]); j++)
      cP.qToQGroup[k++] = i;
  }

  unint sQGroupToXGroup = numReps*maxNumGroups;
  cP.qGroupToXGroup = (unint*)calloc( sQGroupToXGroup, sizeof(*cP.qGroupToXGroup) );
  unint sGroupCountX = maxNumGroups*numReps;
  cP.groupCountX = (unint*)calloc( sGroupCountX, sizeof(*cP.groupCountX) );

  //Compact each row of cM: for query group i, record the list of DB groups
  //it must be compared against, along with their sizes.
  for(i=0; i<numReps; i++){
    for(j=0, k=0; j<numReps; j++){
      if( cM.mat[IDX( i, j, cM.ld )] ){
        cP.qGroupToXGroup[IDX( i, k, cP.ld )] = j;
        cP.groupCountX[IDX( i, k++, cP.ld )] = groupCountX[j];
      }
    }
  }

  //Move to device
  checkErr( cudaMalloc( (void**)&dcP->numGroups, sNumGroups*sizeof(*dcP->numGroups) ) );
  cudaMemcpy( dcP->numGroups, cP.numGroups, sNumGroups*sizeof(*dcP->numGroups), cudaMemcpyHostToDevice );
  checkErr( cudaMalloc( (void**)&dcP->groupCountX, sGroupCountX*sizeof(*dcP->groupCountX) ) );
  cudaMemcpy( dcP->groupCountX, cP.groupCountX, sGroupCountX*sizeof(*dcP->groupCountX), cudaMemcpyHostToDevice );
  checkErr( cudaMalloc( (void**)&dcP->qToQGroup, sQToQGroup*sizeof(*dcP->qToQGroup) ) );
  cudaMemcpy( dcP->qToQGroup, cP.qToQGroup, sQToQGroup*sizeof(*dcP->qToQGroup), cudaMemcpyHostToDevice );
  checkErr( cudaMalloc( (void**)&dcP->qGroupToXGroup, sQGroupToXGroup*sizeof(*dcP->qGroupToXGroup) ) );
  cudaMemcpy( dcP->qGroupToXGroup, cP.qGroupToXGroup, sQGroupToXGroup*sizeof(*dcP->qGroupToXGroup), cudaMemcpyHostToDevice );
  dcP->ld = cP.ld;

  free(cP.numGroups);
  free(cP.groupCountX);
  free(cP.qToQGroup);
  free(cP.qGroupToXGroup);
}
430
431
//Frees the device memory allocated in initCompPlan.
//(The four frees are independent; reverse-of-allocation order is used here.)
void freeCompPlan(compPlan *dcP){
  cudaFree( dcP->qGroupToXGroup );
  cudaFree( dcP->qToQGroup );
  cudaFree( dcP->groupCountX );
  cudaFree( dcP->numGroups );
}
439
440 #endif