Stable release of RBC
[RBC.git] / deprecated.cu
/* This is the more complex version of computeReps. It can be used even if X doesn't
   fit into the device memory. It does not work in emulation mode, because it checks to see
   how much mem is available on the device. Thus for debugging purposes we currently
   use a simpler version of computeReps. */

//Assumes that dr is a matrix already on the device
void computeReps(matrix x, matrix dr, int *repIDs, real *distToReps){
  int memPerX, segSize; //seg==segment
  int index, tempSize; //temp variables used in the loop
  int i;
  matrix dx;
  real *dMins;
  int *dMinIDs;
  memPlan mp;

  int n = x.r; //For convenience

  // Items that need to go on the device: x, repIDs, distToReps. The "+1" is for the
  // distance from each point to its nearest rep (distToReps) and the int is for
  // the ID (repIDs).
  memPerX = (x.pc+1)*sizeof(real)+sizeof(int);
  mp = createMemPlan(x.r,memPerX);

  for(i=0; i<mp.numComputeSegs; i++){
    if(i==mp.numComputeSegs-1)
      segSize = mp.lastSegSize;
    else
      segSize = mp.normSegSize;

    //Allocate & copy over this segment of x
    index = IDX(mp.normSegSize*i, 0, x.ld);
    tempSize = segSize*x.pc*sizeof(*(dx.mat));

    cudaMalloc((void**)&(dx.mat), tempSize);
    cudaMemcpy(dx.mat, &(x.mat[index]), tempSize, cudaMemcpyHostToDevice);
    dx.r=segSize; dx.c=x.c; dx.pr=dx.r; dx.pc=x.pc; dx.ld=x.ld;

    //Allocate buffers to temporarily store the mins and IDs (NOTE: MOVE OUT OF LOOP FOR EFFICIENCY)
    cudaMalloc((void**)&dMins, PAD(MIN(segSize,n))*sizeof(*dMins));
    cudaMalloc((void**)&dMinIDs, PAD(MIN(segSize,n))*sizeof(*dMinIDs));
    nnWrap(dx, dr, dMins, dMinIDs);

    //Copy the results back. Segments start every normSegSize rows on the host side,
    //so the offset uses mp.normSegSize (segSize may be smaller on the last pass).
    cudaMemcpy(&distToReps[i*mp.normSegSize], dMins, MIN(segSize,n)*sizeof(*dMins), cudaMemcpyDeviceToHost);
    cudaMemcpy(&repIDs[i*mp.normSegSize], dMinIDs, MIN(segSize,n)*sizeof(*dMinIDs), cudaMemcpyDeviceToHost);

    cudaFree(dMins);
    cudaFree(dMinIDs);
    cudaFree(dx.mat);
  }
}
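
/* Example (sketch only, not part of the original file): one way computeReps might be
   driven from the host. It reuses helpers that appear elsewhere in this file
   (copyAndMove, PAD); the function name and the assumption that r holds the
   representatives as a host-side matrix are illustrative only. */
void computeRepsExample(matrix x, matrix r){
  matrix dr;
  int *repIDs;
  real *distToReps;

  copyAndMove(&dr, &r);  //put the representatives on the device, as brute2Step does below

  repIDs = (int*)calloc(PAD(x.r), sizeof(*repIDs));
  distToReps = (real*)calloc(PAD(x.r), sizeof(*distToReps));

  //For each row of x: the index of its nearest representative and the distance to it
  computeReps(x, dr, repIDs, distToReps);

  free(repIDs);
  free(distToReps);
  cudaFree(dr.mat);
}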

__global__ void getMinsKernel(matrix,real*,int*);

// Returns the min of each row of D. dMins and dMinIDs
// are assumed to be (at least) of size D.r.
__global__ void getMinsKernel(const matrix D, real *dMins, int *dMinIDs){
  int row, locRow, colOff, i, curCol;
  real temp;

  row = blockIdx.y*BLOCK_SIZE+threadIdx.y;
  locRow = threadIdx.y;

  colOff = threadIdx.x; //column offset of this thread

  //Shared scratch space for the per-row reduction (real, to match the element type of D)
  __shared__ real mins[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ int pos[BLOCK_SIZE][BLOCK_SIZE];

  // This loop finds the minimum of cols
  // [colOff, BLOCK_SIZE+colOff, 2*BLOCK_SIZE+colOff,...]
  // and stores it in mins[locRow][colOff].
  mins[locRow][colOff]=MAX_REAL;
  pos[locRow][colOff]=-1;
  for(i=0; i<D.pc/BLOCK_SIZE; i++){
    curCol = i*BLOCK_SIZE+colOff;
    if(curCol < D.c){ //ignore padding
      temp = D.mat[IDX(row,curCol,D.ld)];
      if(temp<mins[locRow][colOff]){
        mins[locRow][colOff]=temp;
        pos[locRow][colOff]=curCol;
      }
    }
  }
  __syncthreads();

  //Now reduce over cols [0, ... , BLOCK_SIZE-1]
  for(i=BLOCK_SIZE/2; i>0; i/=2){
    if(colOff<i){
      //compare (col) to (col+i)
      if(mins[locRow][colOff]>mins[locRow][colOff+i]){
        mins[locRow][colOff]=mins[locRow][colOff+i];
        pos[locRow][colOff]=pos[locRow][colOff+i];
      }
    }
    __syncthreads();
  }

  //arbitrarily use the first thread (along x) to write out the result
  if(threadIdx.x==0){
    dMins[row] = mins[locRow][0];
    dMinIDs[row] = pos[locRow][0];
  }
}
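
/* Sketch (not part of the original file): a host-side wrapper launching getMinsKernel,
   in the same style as kMinsWrap below. The name getMinsWrapExample is illustrative;
   dD, dMins and dMinIDs are assumed to already be on the device, with dD.pr and both
   output buffers padded to a multiple of BLOCK_SIZE. */
void getMinsWrapExample(matrix dD, real *dMins, int *dMinIDs){
  dim3 block(BLOCK_SIZE,BLOCK_SIZE);   //BLOCK_SIZE x BLOCK_SIZE threads per block
  dim3 grid(1,dD.pr/BLOCK_SIZE);       //one block per BLOCK_SIZE rows of dD

  getMinsKernel<<<grid,block>>>(dD,dMins,dMinIDs);
  cudaThreadSynchronize();
}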

// Returns the k smallest entries of each row of D, storing the values in dMins and the
// corresponding column indices in NNs. D is modified in the process: each selected
// entry is overwritten with MAX_REAL.
__global__ void getKMinsKernel(matrix D, matrix dMins, intMatrix NNs, int k){
  int row, locRow, colOff, i, curCol, j;
  real temp;

  row = blockIdx.y*BLOCK_SIZE+threadIdx.y;
  locRow = threadIdx.y;

  //printf("row=%d D.r =%d \n",row,D.r);
  /* if(row>=D.r) */
  /*   return; */

  colOff = threadIdx.x; //column offset of this thread

  __shared__ real mins[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ int pos[BLOCK_SIZE][BLOCK_SIZE];

  for(i=0; i<k; i++){
    // This loop finds the minimum of cols
    // [colOff, BLOCK_SIZE+colOff, 2*BLOCK_SIZE+colOff,...]
    // and stores it in mins[locRow][colOff].
    mins[locRow][colOff]=MAX_REAL;
    pos[locRow][colOff]=-1;
    for(j=0; j<D.pc/BLOCK_SIZE; j++){
      curCol = j*BLOCK_SIZE+colOff;
      if(curCol < D.c){ //ignore padding
        temp = D.mat[IDX(row,curCol,D.ld)];
        if(temp<mins[locRow][colOff]){
          mins[locRow][colOff]=temp;
          pos[locRow][colOff]=curCol;
        }
      }
    }
    __syncthreads();

    //Now reduce over cols [0, ... , BLOCK_SIZE-1]
    for(j=BLOCK_SIZE/2; j>0; j/=2){
      if(colOff<j){
        //compare (col) to (col+j)
        if(mins[locRow][colOff]>mins[locRow][colOff+j]){
          mins[locRow][colOff]=mins[locRow][colOff+j];
          pos[locRow][colOff]=pos[locRow][colOff+j];
        }
      }
      __syncthreads();
    }

    //arbitrarily use the first thread (along x) to write out the i-th smallest entry,
    //then knock it out of D so the next pass finds the following one
    if(threadIdx.x==0 && row<D.r){
      dMins.mat[IDX(row,i,dMins.ld)] = mins[locRow][0];
      NNs.mat[IDX(row,i,NNs.ld)] = pos[locRow][0];
      D.mat[IDX(row,pos[locRow][0],D.ld)]=MAX_REAL;
    }
    __syncthreads();
  }
}
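
/* Sketch (illustrative only): launching getKMinsKernel, mirroring kMinsWrap below but
   with the explicit k parameter. The wrapper name is hypothetical; dD, dMins and dNNs
   are assumed to already live on the device with at least k (padded) output columns. */
void getKMinsWrapExample(matrix dD, matrix dMins, intMatrix dNNs, int k){
  dim3 block(BLOCK_SIZE,BLOCK_SIZE);
  dim3 grid(1,dD.pr/BLOCK_SIZE);

  getKMinsKernel<<<grid,block>>>(dD,dMins,dNNs,k);
  cudaThreadSynchronize();
}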

size_t countCompute(int*,int*,charMatrix);

//This is used for debugging/research only: it tallies how much brute-force work the
//computation plan implies and prints a few summary statistics.
size_t countCompute(int *groupCountQ, int *groupCountX, charMatrix cM){
  int i,j;
  size_t ans=0;
  size_t avgBlocks=0;
  size_t maxBlocks=0;
  size_t maxBlocksInd=0;
  size_t maxTemp;
  size_t avgBlockQ=0;
  size_t avgBlockX=0;
  size_t maxBlockX=0;
  size_t maxBlockQ=0;

  for(i=0; i<cM.c; i++){
    maxTemp=0;
    for(j=0; j<cM.r; j++){
      //printf("%d ",cM.mat[IDX(i,j,cM.ld)]*PAD(groupCountQ[i])*PAD(groupCountX[j]));
      ans+=cM.mat[IDX(i,j,cM.ld)]*(groupCountQ[i])*(groupCountX[j]);
      avgBlocks+=cM.mat[IDX(i,j,cM.ld)];
      maxTemp+=cM.mat[IDX(i,j,cM.ld)]*PAD(groupCountX[j]);
    }
    //printf("\n");
    if(maxBlocks < maxTemp){
      maxBlocks=maxTemp;
      maxBlocksInd=PAD(groupCountQ[i]);
    }
    //maxBlocks=MAX(maxTemp,maxBlocks);
  }

  for(i=0; i<cM.c; i++){
    avgBlockQ+=groupCountQ[i];
    avgBlockX+=groupCountX[i];
    maxBlockQ=MAX(maxBlockQ,groupCountQ[i]);
    maxBlockX=MAX(maxBlockX,groupCountX[i]);
  }

  printf("most amt of work for a query: %zu (%zu) ; avg = %6.4f \n",maxBlocks,maxBlocksInd,((double)ans)/((double)cM.c));
  printf("avg blocks/query block = %6.4f ; \n",((double)avgBlocks)/((double)cM.c));
  printf("avg blockQ = %6.4f; max = %zu \n",((double)avgBlockQ)/((double)cM.c),maxBlockQ);
  printf("avg blockX = %6.4f; max = %zu \n",((double)avgBlockX)/((double)cM.c),maxBlockX);

  return ans;
}

void kMinsWrap(matrix dD, matrix dMins, intMatrix dNNs){
  dim3 block(BLOCK_SIZE,BLOCK_SIZE);
  dim3 grid(1,dD.pr/BLOCK_SIZE);

  kMinsKernel<<<grid,block>>>(dD,dMins,dNNs);
  cudaThreadSynchronize();
}

__global__ void kMinsKernel(matrix,matrix,intMatrix);

//For each row of D, finds its NNs.c smallest entries: the values go into dMins and the
//column indices into NNs. Selected entries of D are overwritten with MAX_REAL.
__global__ void kMinsKernel(matrix D, matrix dMins, intMatrix NNs){

  int row = blockIdx.y*BLOCK_SIZE + threadIdx.y;
  int ro = threadIdx.y; //row offset
  int co = threadIdx.x; //col offset

  __shared__ real smin[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ int smp[BLOCK_SIZE][BLOCK_SIZE];

  real min, t;
  int mp; //min position
  int i,j,c;

  for(i=0; i<NNs.c; i++){

    //Each thread scans cols [co, BLOCK_SIZE+co, 2*BLOCK_SIZE+co, ...] of its row
    min = MAX_REAL;
    mp = -1; //no column found yet (mirrors pos in getMinsKernel)
    for(j=0; j<D.pc/BLOCK_SIZE; j++){
      c = j*BLOCK_SIZE;
      if( c+co < D.c ){
        t = D.mat[ IDX( row, c+co, D.ld ) ];

        if( t < min ){
          min = t;
          mp = c+co;
        }
      }
    }

    smin[ro][co] = min;
    smp[ro][co] = mp;
    __syncthreads();

    //Reduce across the BLOCK_SIZE partial minima of this row
    for(j=BLOCK_SIZE/2; j>0; j/=2){
      if( co < j ){
        if( smin[ro][co+j] < smin[ro][co] ){
          smin[ro][co] = smin[ro][co+j];
          smp[ro][co] = smp[ro][co+j];
        }
      }
      __syncthreads();
    }

    //Thread 0 writes out the i-th smallest entry, then knocks it out of D
    if(co==0){
      NNs.mat[ IDX( row, i, NNs.ld ) ] = smp[ro][0];
      dMins.mat[ IDX( row, i, dMins.ld ) ] = smin[ro][0];

      D.mat[ IDX( row, smp[ro][0], D.ld ) ] = MAX_REAL;
    }

    __syncthreads();
  }
}

void brute2Step(matrix,matrix,intMatrix);

//Brute-force k-NN in two steps: compute the full query-to-database distance matrix,
//then pull out the NNs.c smallest entries of each row.
void brute2Step(matrix x, matrix q, intMatrix NNs){

  matrix dx, dq, dD, dMins;
  intMatrix dNNs;
  real *ranges, *dranges; //allocated below but not otherwise used here

  copyAndMove(&dx, &x);
  copyAndMove(&dq, &q);
  copyAndMoveI(&dNNs, &NNs); //NNs.mat is uninitialized at this point, but that's fine

  //dD holds the q.r-by-x.r distance matrix on the device
  dD.r=q.r; dD.pr=q.pr; dD.c=x.r; dD.pc=x.pr; dD.ld=dD.pc;
  cudaMalloc( (void**)&dD.mat, dD.pr*dD.pc*sizeof(*dD.mat) );

  dMins.r=NNs.r; dMins.pr=NNs.pr; dMins.c=NNs.c; dMins.pc=NNs.pc; dMins.ld=NNs.ld;
  cudaMalloc( (void**)&dMins.mat, dMins.pr*dMins.pc*sizeof(*dMins.mat) );

  ranges = (real*)calloc( q.pr, sizeof(*ranges) );
  cudaMalloc( (void**)&dranges, q.pr*sizeof(*dranges) );

  dist1Wrap(dq,dx,dD);

  kMinsWrap(dD, dMins, dNNs);
  cudaMemcpy(NNs.mat, dNNs.mat, NNs.pr*NNs.pc*sizeof(*NNs.mat), cudaMemcpyDeviceToHost);

  free(ranges);
  cudaFree(dranges);
  cudaFree(dx.mat);
  cudaFree(dq.mat);
  cudaFree(dD.mat);
  cudaFree(dNNs.mat);
  cudaFree(dMins.mat);
}
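
/* Sketch (not part of the original file): one possible driver for brute2Step. The
   padding conventions (pr/pc are padded dimensions, ld == pc) and PAD are taken from
   this file; the assumption that intMatrix.mat is an int array follows the int
   pos/smp buffers used above. The function name and data-filling step are
   illustrative only. */
void brute2StepExample(int nX, int nQ, int dim, int k){
  matrix x, q;
  intMatrix NNs;

  x.r=nX; x.c=dim; x.pr=PAD(nX); x.pc=PAD(dim); x.ld=x.pc;
  q.r=nQ; q.c=dim; q.pr=PAD(nQ); q.pc=PAD(dim); q.ld=q.pc;
  x.mat = (real*)calloc( x.pr*x.pc, sizeof(*x.mat) );
  q.mat = (real*)calloc( q.pr*q.pc, sizeof(*q.mat) );
  //...fill x.mat and q.mat with data here (row-major, stride ld)...

  NNs.r=nQ; NNs.c=k; NNs.pr=PAD(nQ); NNs.pc=PAD(k); NNs.ld=NNs.pc;
  NNs.mat = (int*)calloc( NNs.pr*NNs.pc, sizeof(*NNs.mat) );

  brute2Step(x, q, NNs); //NNs now holds, per query row, the indices of its k nearest DB points

  free(x.mat); free(q.mat); free(NNs.mat);
}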