/* This file contains functions that were in use at one point but
   are not currently used.  As far as I know, everything here is
   debugged, and so can be plugged back in reasonably safely.
*/



/* This is the more complex version of computeReps.  It can be used even if X
   does not fit into device memory.  It does not work in emulation mode, because
   it checks how much memory is available on the device.  Thus, for debugging
   purposes we currently use a simpler version of computeReps. */

//Assumes that dr is a matrix already on the device
void computeReps(matrix x, matrix dr, int *repIDs, real *distToReps){
  int memPerX, segSize; //seg==segment
  int index, tempSize; //temp variables used in the loop
  int i;
  matrix dx;
  real *dMins;
  int *dMinIDs;
  memPlan mp;

  int n = x.r; //For convenience

  // Items that need to go on device: x, repIDs, distToReps.  The "+1" is for the
  // distance from each point to its nearest rep (distToReps) and the int is for
  // the ID (repIDs).
  memPerX = (x.pc+1)*sizeof(real)+sizeof(int);
  mp = createMemPlan(x.r,memPerX);

  for(i=0;i<mp.numComputeSegs;i++){
    if(i==mp.numComputeSegs-1)
      segSize = mp.lastSegSize;
    else
      segSize = mp.normSegSize;

    //Allocate & copy over data
    index = IDX(mp.normSegSize*i,0,x.ld);
    tempSize = segSize*x.pc*sizeof(*(dx.mat));

    cudaMalloc((void**)&(dx.mat),tempSize);
    cudaMemcpy(dx.mat,&(x.mat[index]),tempSize,cudaMemcpyHostToDevice);
    dx.r=segSize; dx.c=x.c; dx.pr=dx.r; dx.pc=x.pc; dx.ld=x.ld;

    //Allocate matrices to temporarily store mins and IDs (NOTE: MOVE OUT OF LOOP FOR EFFICIENCY)
    cudaMalloc((void**)&(dMins), PAD(MIN(segSize,n))*sizeof(*dMins));
    cudaMalloc((void**)&(dMinIDs), PAD(MIN(segSize,n))*sizeof(*dMinIDs));
    nnWrap(dx,dr,dMins,dMinIDs);

    //Copy the results back.  The host offset must use normSegSize (the size of
    //every segment but the last), matching the read offset above; using segSize
    //here would be wrong on the final, shorter segment.
    cudaMemcpy(&distToReps[i*mp.normSegSize],dMins,MIN(segSize,n)*sizeof(*dMins),cudaMemcpyDeviceToHost);
    cudaMemcpy(&repIDs[i*mp.normSegSize],dMinIDs,MIN(segSize,n)*sizeof(*dMinIDs),cudaMemcpyDeviceToHost);

    cudaFree(dMins);
    cudaFree(dMinIDs);
    cudaFree(dx.mat);
  }
}
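
/* A minimal host-side calling sketch for computeReps (illustration only; not
   part of the original code).  x is a host matrix, dr holds the representatives
   and must already be on the device (e.g. moved there with copyAndMove, as in
   brute2Step below), and the output arrays need one entry per row of x
   (PAD(x.r) entries is safe, since the segments are padded):

     int  *repIDs     = (int*) calloc( PAD(x.r), sizeof(*repIDs) );
     real *distToReps = (real*)calloc( PAD(x.r), sizeof(*distToReps) );

     computeReps(x, dr, repIDs, distToReps);
     //repIDs[i] now holds the index of the representative nearest to row i of
     //x (as reported by nnWrap), and distToReps[i] the corresponding distance.
*/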


__global__ void getMinsKernel(matrix,real*,int*);

// Returns the min of each row of D.  dMins and dMinIDs
// are assumed to be (at least) of size D.r.
__global__ void getMinsKernel(const matrix D, real *dMins, int *dMinIDs){
  int row, locRow, colOff, i, curCol;
  real temp;

  row = blockIdx.y*BLOCK_SIZE+threadIdx.y;
  locRow = threadIdx.y;

  colOff = threadIdx.x; //column offset of this thread

  __shared__ real mins[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ int pos[BLOCK_SIZE][BLOCK_SIZE];


  // This loop finds the minimum of cols
  // [colOff, BLOCK_SIZE+colOff, 2*BLOCK_SIZE+colOff,...]
  // and stores it in mins[locRow][colOff].
  mins[locRow][colOff]=MAX_REAL;
  pos[locRow][colOff]=-1;
  for (i=0;i<D.pc/BLOCK_SIZE;i++){
    curCol = i*BLOCK_SIZE+colOff;
    if(curCol < D.c){ //ignore padding
      temp = D.mat[IDX(row,curCol,D.ld)];
      if(temp<mins[locRow][colOff]){
        mins[locRow][colOff]=temp;
        pos[locRow][colOff]=curCol;
      }
    }
  }
  __syncthreads();

  //Now reduce over cols [0, ... , BLOCK_SIZE-1] of the shared buffer
  for (i=BLOCK_SIZE/2; i>0; i/=2){
    if(colOff<i){
      //compare (col) to (col+i)
      if(mins[locRow][colOff]>mins[locRow][colOff+i]){
        mins[locRow][colOff]=mins[locRow][colOff+i];
        pos[locRow][colOff]=pos[locRow][colOff+i];
      }
    }
    __syncthreads();
  }

  //arbitrarily use the first thread (along x) to set memory
  if(threadIdx.x==0){
    dMins[row] = mins[locRow][0];
    dMinIDs[row] = pos[locRow][0];
  }
}
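
/* A minimal launch sketch for getMinsKernel, written in the style of kMinsWrap
   and pruneWrap below.  This wrapper (and its name) is an assumption added for
   illustration; the real driver for this kernel, nnWrap, lives elsewhere in the
   codebase.  It presumes dD.pr is a multiple of BLOCK_SIZE, as the padded
   matrices here are. */
void getMinsWrap(matrix dD, real *dMins, int *dMinIDs){
  dim3 block(BLOCK_SIZE,BLOCK_SIZE);
  dim3 grid(1,dD.pr/BLOCK_SIZE);

  getMinsKernel<<<grid,block>>>(dD,dMins,dMinIDs);
  cudaThreadSynchronize();
}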


// Returns the k smallest entries of each row of D, together with their column
// indices.  dMins and NNs are assumed to have (at least) D.r rows and k columns.
// Note that D is overwritten: each selected entry is set to MAX_REAL so that
// the next pass finds the next-smallest value.
__global__ void getKMinsKernel(matrix D, matrix dMins, intMatrix NNs, int k){
  int row, locRow, colOff, i, curCol, j;
  real temp;

  row = blockIdx.y*BLOCK_SIZE+threadIdx.y;
  locRow = threadIdx.y;

  // Rows past D.r are not skipped here (an early return would break the
  // __syncthreads below); the bounds check is applied at the final write instead.

  colOff = threadIdx.x; //column offset of this thread

  __shared__ real mins[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ int pos[BLOCK_SIZE][BLOCK_SIZE];

  for(i=0;i<k;i++){
    // This loop finds the minimum of cols
    // [colOff, BLOCK_SIZE+colOff, 2*BLOCK_SIZE+colOff,...]
    // and stores it in mins[locRow][colOff].
    mins[locRow][colOff]=MAX_REAL;
    pos[locRow][colOff]=-1;
    for (j=0;j<D.pc/BLOCK_SIZE;j++){
      curCol = j*BLOCK_SIZE+colOff;
      if(curCol < D.c){ //ignore padding
        temp = D.mat[IDX(row,curCol,D.ld)];
        if(temp<mins[locRow][colOff]){
          mins[locRow][colOff]=temp;
          pos[locRow][colOff]=curCol;
        }
      }
    }
    __syncthreads();


    //Now reduce over cols [0, ... , BLOCK_SIZE-1] of the shared buffer
    for (j=BLOCK_SIZE/2; j>0; j/=2){
      if(colOff<j){
        //compare (col) to (col+j)
        if(mins[locRow][colOff]>mins[locRow][colOff+j]){
          mins[locRow][colOff]=mins[locRow][colOff+j];
          pos[locRow][colOff]=pos[locRow][colOff+j];
        }
      }
      __syncthreads();
    }

    //arbitrarily use the first thread (along x) to set memory
    if(threadIdx.x==0 && row<D.r){
      dMins.mat[IDX(row,i,dMins.ld)] = mins[locRow][0];
      NNs.mat[IDX(row,i,NNs.ld)] = pos[locRow][0];
      D.mat[IDX(row,pos[locRow][0],D.ld)]=MAX_REAL;
    }
    __syncthreads();
  }
}
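
/* A launch sketch for getKMinsKernel, mirroring kMinsWrap below; the wrapper
   and its name are assumptions added for illustration, not part of the original
   code.  Remember that the kernel consumes dD (selected entries are overwritten
   with MAX_REAL). */
void getKMinsWrap(matrix dD, matrix dMins, intMatrix dNNs, int k){
  dim3 block(BLOCK_SIZE,BLOCK_SIZE);
  dim3 grid(1,dD.pr/BLOCK_SIZE);

  getKMinsKernel<<<grid,block>>>(dD,dMins,dNNs,k);
  cudaThreadSynchronize();
}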

size_t countCompute(int*,int*,charMatrix);

//This is used for debugging/research.  It tallies the number of
//query-point/database-point comparisons implied by the interaction matrix cM
//and the group sizes, and prints some summary statistics.
size_t countCompute(int *groupCountQ, int *groupCountX, charMatrix cM){
  int i,j;
  size_t ans=0;
  size_t avgBlocks=0;
  size_t maxBlocks=0;
  size_t maxBlocksInd=0;
  size_t maxTemp;
  size_t avgBlockQ=0;
  size_t avgBlockX=0;
  size_t maxBlockX=0;
  size_t maxBlockQ=0;


  for(i=0;i<cM.c;i++){
    maxTemp=0;
    for(j=0;j<cM.r;j++){
      //printf("%d ",cM.mat[IDX(i,j,cM.ld)]*PAD(groupCountQ[i])*PAD(groupCountX[j]));
      ans+=cM.mat[IDX(i,j,cM.ld)]*(groupCountQ[i])*(groupCountX[j]);
      avgBlocks+=cM.mat[IDX(i,j,cM.ld)];
      maxTemp+=cM.mat[IDX(i,j,cM.ld)]*PAD(groupCountX[j]);
    }
    // printf("\n");
    if(maxBlocks < maxTemp){
      maxBlocks=maxTemp;
      maxBlocksInd=PAD(groupCountQ[i]);
    }
    //maxBlocks=MAX(maxTemp,maxBlocks);
  }

  for(i=0;i<cM.c;i++){
    avgBlockQ+=groupCountQ[i];
    avgBlockX+=groupCountX[i];
    maxBlockQ=MAX(maxBlockQ,groupCountQ[i]);
    maxBlockX=MAX(maxBlockX,groupCountX[i]);
  }

  printf("most amt of work for a query: %zu (%zu) ; avg = %6.4f \n",maxBlocks,maxBlocksInd,((double)ans)/((double)cM.c));
  printf("avg blocks/query block = %6.4f ; \n",((double)avgBlocks)/((double)cM.c));
  printf("avg blockQ = %6.4f; max = %zu \n",((double)avgBlockQ)/((double)cM.c),maxBlockQ);
  printf("avg blockX = %6.4f; max = %zu \n",((double)avgBlockX)/((double)cM.c),maxBlockX);

  return ans;
}


void kMinsWrap(matrix dD, matrix dMins, intMatrix dNNs){
  dim3 block(BLOCK_SIZE,BLOCK_SIZE);
  dim3 grid(1,dD.pr/BLOCK_SIZE);

  kMinsKernel<<<grid,block>>>(dD,dMins,dNNs);
  cudaThreadSynchronize();
}


__global__ void kMinsKernel(matrix,matrix,intMatrix);

// Like getKMinsKernel, but the number of neighbors is taken from NNs.c instead
// of an explicit k argument.  D is overwritten as the mins are extracted.
__global__ void kMinsKernel(matrix D, matrix dMins, intMatrix NNs){

  int row = blockIdx.y*BLOCK_SIZE + threadIdx.y;
  int ro = threadIdx.y; //row offset
  int co = threadIdx.x; //col offset

  __shared__ real smin[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ int smp[BLOCK_SIZE][BLOCK_SIZE];

  real min, t;
  int mp; //min position
  int i,j,c;


  for(i=0 ; i<NNs.c ; i++){

    min = MAX_REAL;
    for(j=0 ; j<D.pc/BLOCK_SIZE ; j++){
      c = j*BLOCK_SIZE;
      if( c+co < D.c ){
        t = D.mat[ IDX( row, c+co, D.ld ) ];

        if( t < min ){
          min = t;
          mp = c+co;
        }
      }
    }

    smin[ro][co] = min;
    smp[ro][co] = mp;
    __syncthreads();


    for(j=BLOCK_SIZE/2 ; j>0 ; j/=2){
      if( co < j ){
        if( smin[ro][co+j] < smin[ro][co] ){
          smin[ro][co] = smin[ro][co+j];
          smp[ro][co] = smp[ro][co+j];
        }
      }
      __syncthreads();
    }

    if(co==0){
      NNs.mat[ IDX( row, i, NNs.ld ) ] = smp[ro][0];
      dMins.mat[ IDX( row, i, dMins.ld ) ] = smin[ro][0];

      D.mat[ IDX( row, smp[ro][0], D.ld ) ] = MAX_REAL;
    }

    __syncthreads();
  }
}

void brute2Step(matrix,matrix,intMatrix);


// Brute-force k-NN in two steps: compute the full q-by-x distance matrix on the
// device, then pull out the NNs.c smallest entries of each row.
void brute2Step(matrix x, matrix q, intMatrix NNs){

  matrix dx, dq, dD, dMins;
  intMatrix dNNs;
  real *ranges, *dranges; //allocated and freed below, but otherwise unused

  copyAndMove(&dx, &x);
  copyAndMove(&dq, &q);
  copyAndMoveI(&dNNs, &NNs); //NNs.mat is garbage, but no matter.

  dD.r=q.r; dD.pr=q.pr; dD.c=x.r; dD.pc=x.pr; dD.ld=dD.pc;
  cudaMalloc( (void**)&dD.mat, dD.pr*dD.pc*sizeof(*dD.mat) );

  dMins.r=NNs.r; dMins.pr=NNs.pr; dMins.c=NNs.c; dMins.pc=NNs.pc; dMins.ld=NNs.ld;
  cudaMalloc( (void**)&dMins.mat, dMins.pr*dMins.pc*sizeof(*dMins.mat) );

  ranges = (real*)calloc( q.pr, sizeof(*ranges) );
  cudaMalloc( (void**)&dranges, q.pr*sizeof(*dranges) );

  dist1Wrap(dq,dx,dD);

  kMinsWrap(dD, dMins, dNNs);
  cudaMemcpy(NNs.mat, dNNs.mat, NNs.pr*NNs.pc*sizeof(*NNs.mat), cudaMemcpyDeviceToHost);

  free(ranges);
  cudaFree(dranges);
  cudaFree(dx.mat);
  cudaFree(dq.mat);
  cudaFree(dD.mat);
  cudaFree(dNNs.mat);
  cudaFree(dMins.mat);
}
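
/* A minimal usage sketch for brute2Step (not part of the original code; the
   sizes, the padding convention, and the element type of intMatrix.mat are
   assumptions).  Host matrices use the field layout seen elsewhere in this
   file: mat is row-major with pr*pc entries, pr/pc padded, ld == pc:

     matrix x, q;  intMatrix NNs;
     unint n=4096, m=512, d=64, k=16;

     x.r=n; x.c=d; x.pr=PAD(n); x.pc=PAD(d); x.ld=x.pc;
     q.r=m; q.c=d; q.pr=PAD(m); q.pc=PAD(d); q.ld=q.pc;
     NNs.r=m; NNs.c=k; NNs.pr=PAD(m); NNs.pc=PAD(k); NNs.ld=NNs.pc;

     x.mat   = (real*)calloc( x.pr*x.pc, sizeof(*x.mat) );
     q.mat   = (real*)calloc( q.pr*q.pc, sizeof(*q.mat) );
     NNs.mat = (unint*)calloc( NNs.pr*NNs.pc, sizeof(*NNs.mat) );

     // ... fill x.mat and q.mat ...

     brute2Step(x, q, NNs);
     // NNs row i now lists the k rows of x closest to query row i of q.
*/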


memPlan createMemPlan(unint,unint);

memPlan createMemPlan(unint nPts, unint memPerPt){
  memPlan mp;
  unsigned int memFree, memTot;
  unint ptsAtOnce;

  //Note: this uses the old driver-API signature of cuMemGetInfo, which took
  //unsigned int pointers; recent toolkits take size_t*.
  cuMemGetInfo(&memFree, &memTot);
  memFree = (unint)(((float)memFree)*MEM_USABLE);
  printf("memfree = %u \n",memFree);
  ptsAtOnce = DPAD(memFree/memPerPt); //max number of pts that can be processed at once
  printf("ptsAtOnce = %u \n",ptsAtOnce);
  mp.numComputeSegs = nPts/ptsAtOnce + ((nPts%ptsAtOnce==0) ? 0 : 1);
  mp.normSegSize=PAD(nPts/mp.numComputeSegs);
  mp.lastSegSize=PAD(nPts) - mp.normSegSize*(mp.numComputeSegs-1);
  //Note that lastSegSize is automatically padded if nPts is.
  return mp;
}

typedef struct {
  unint numComputeSegs;
  unint normSegSize; //The number of points handled in one computation,
                     //though there will always be one leftover segment
                     //with (possibly) a different number of points.
  unint lastSegSize; //.. and this is it.
} memPlan;
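
/* A worked example of the segmentation arithmetic in createMemPlan above
   (illustrative numbers only; it assumes PAD rounds up and DPAD rounds down to
   a multiple of BLOCK_SIZE = 16):

     nPts = 1000, memPerPt = 400 bytes, usable memFree = 100000 bytes
     ptsAtOnce      = DPAD(100000/400) = DPAD(250) = 240
     numComputeSegs = 1000/240 + 1     = 5
     normSegSize    = PAD(1000/5)      = PAD(200)  = 208
     lastSegSize    = PAD(1000) - 4*208 = 1008 - 832 = 176

   so the first four segments each handle 208 (padded) points and the last
   handles the remaining 176, for a total of PAD(1000) = 1008.
*/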


void blockIntersection(charMatrix,matrix,real*,real*);

// Computes the all-pairs distances between the representatives (dr), then fills
// in cM: entry (q,x) is set to 1 iff the radius-based pruning test in
// pruneKernel cannot rule out interaction between query group q and database
// group x.
void blockIntersection(charMatrix cM, matrix dr, real *radiiX, real *radiiQ){
  matrix dD;
  real *dradiiX, *dradiiQ;
  unint pnR = dr.pr;
  charMatrix dcM;

  dD.r=dD.c=dr.r; dD.pr=dD.pc=dD.ld=dr.pr;
  dcM.r=cM.r; dcM.c=cM.c; dcM.pr=cM.pr; dcM.pc=cM.pc; dcM.ld=cM.ld;

  checkErr( cudaMalloc((void**)&dD.mat, pnR*pnR*sizeof(*dD.mat)) );
  checkErr( cudaMalloc((void**)&dradiiX, pnR*sizeof(*dradiiX)) );
  checkErr( cudaMalloc((void**)&dradiiQ, pnR*sizeof(*dradiiQ)) );
  checkErr( cudaMalloc((void**)&dcM.mat, dcM.pr*dcM.pc*sizeof(*dcM.mat)) );

  // Copying over the radii.  Note that everything after the first dr.r places
  // of the device arrays is undefined.
  cudaMemcpy(dradiiX,radiiX,dr.r*sizeof(*dradiiX),cudaMemcpyHostToDevice);
  cudaMemcpy(dradiiQ,radiiQ,dr.r*sizeof(*dradiiQ),cudaMemcpyHostToDevice);

  dist1Wrap(dr, dr, dD);
  pruneWrap(dcM, dD, dradiiX, dradiiQ);

  //Copy back the same number of bytes that were allocated for dcM above.
  cudaMemcpy(cM.mat,dcM.mat,dcM.pr*dcM.pc*sizeof(*dcM.mat),cudaMemcpyDeviceToHost);

  cudaFree(dcM.mat);
  cudaFree(dradiiQ);
  cudaFree(dradiiX);
  cudaFree(dD.mat);
}



void groupPoints(matrix,unint*,unint*,unint);

// This function sorts points by their repID.  It makes two passes through the
// matrix x: one to count the bucket sizes, the next to place points in the
// correct bucket.  Note that this function allocates a temporary
// matrix the size of x, then copies the results over to x at the end.  The
// sort could be done in place, e.g. by doing numReps passes through x instead of 2.
void groupPoints(matrix x, unint *xID, unint *repIDs, unint numReps){
  matrix y;
  unint n=x.r;
  unint d=x.c;
  unint i;
  unint *gS; //groupSize
  unint *yID;

  yID = (unint*)calloc(n,sizeof(*yID));
  y.mat = (real*)calloc(n*d,sizeof(*y.mat));
  gS = (unint*)calloc(numReps+1,sizeof(*gS));

  y.r=n; y.pr=n; y.c=d; y.pc=d; y.ld=d;

  //Pass 1: count the size of each bucket (shifted up by one index)...
  for(i=0;i<n;i++)
    gS[repIDs[i]+1]++;
  //...and turn the counts into starting offsets.
  for(i=1;i<numReps;i++)
    gS[i]=gS[i-1]+gS[i];

  //Pass 2: place each point (and its ID) into its bucket.
  for(i=0;i<n;i++){
    copyVector(&y.mat[IDX(gS[repIDs[i]],0,y.ld)], &x.mat[IDX(i,0,x.ld)],d);
    yID[gS[repIDs[i]]]=xID[i];
    gS[repIDs[i]]++;
  }

  //Copy the sorted points (and IDs) back into x.
  for(i=0;i<n;i++){
    copyVector(&x.mat[IDX(i,0,x.ld)],&y.mat[IDX(i,0,y.ld)],d);
    xID[i]=yID[i];
  }

  free(yID);
  free(gS);
  free(y.mat);
}
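
/* A small worked example of the two counting-sort passes above (illustrative
   values, not from the original code).  With n = 5, numReps = 3 and
   repIDs = {1, 0, 1, 2, 0}:

     after pass 1 (counts, shifted by one):  gS = {0, 2, 2, 1}
     after the prefix sum (start offsets):   gS = {0, 2, 4, 1}

   so bucket 0 starts at row 0, bucket 1 at row 2, and bucket 2 at row 4.
   Pass 2 then writes the points in the order {1, 4, 0, 2, 3} (original row
   indices): the two rep-0 points first, then the two rep-1 points, then the
   rep-2 point.
*/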

__global__ void pruneKernel(const matrix,const real*,const real*,charMatrix);


// For each (query group, database group) pair, writes a 1 to cM iff the
// triangle-inequality test based on the two group radii cannot rule the pair
// out, and a 0 otherwise.  D holds the rep-to-rep distances.
__global__ void pruneKernel(const matrix D, const real *radiiX, const real *radiiQ, charMatrix cM){
  unint offX = threadIdx.x;
  unint offQ = threadIdx.y;

  unint blockX = blockIdx.x * BLOCK_SIZE;
  unint blockQ = blockIdx.y * BLOCK_SIZE;

  __shared__ real sD[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ real sRQ[BLOCK_SIZE];
  __shared__ real sRX[BLOCK_SIZE];

  sD[offQ][offX]=D.mat[IDX(blockQ+offQ,blockX+offX,D.ld)];

  if(offQ==0)
    sRX[offX]=radiiX[blockX+offX];
  if(offX==0)
    sRQ[offQ]=radiiQ[blockQ+offQ];

  __syncthreads();

  if(blockQ+offQ < D.r && blockX+offX < D.c){
    cM.mat[IDX(blockQ+offQ,blockX+offX,cM.ld)] = (sD[offQ][offX]-sRX[offX]-2*sRQ[offQ] <= 0) ? 1 : 0;
  }
}

void pruneWrap(charMatrix,matrix,real*,real*);

void pruneWrap(charMatrix dcM, matrix dD, real *dradiiX, real *dradiiQ){
  dim3 block(BLOCK_SIZE,BLOCK_SIZE);
  dim3 grid(dD.pr/BLOCK_SIZE,dD.pc/BLOCK_SIZE);

  pruneKernel<<<grid,block>>>(dD,dradiiX,dradiiQ,dcM);
  cudaThreadSynchronize();
}