caught host memory leak
[RBC.git] / kernelWrap.cu
1 /* This file is part of the Random Ball Cover (RBC) library.
2 * (C) Copyright 2010, Lawrence Cayton [lcayton@tuebingen.mpg.de]
3 */
4
5 #ifndef KERNELWRAP_CU
6 #define KERNELWRAP_CU
7
8 #include<cuda.h>
9 #include<stdio.h>
10 #include "kernels.h"
11 #include "defs.h"
12
/*
 * Host-side launcher for dist1Kernel on the pair (dq, dx), with dD passed
 * through to the kernel as its third matrix argument.
 *
 * The launch is tiled so that neither grid dimension is asked for more than
 * MAX_BS blocks: grid.x walks dx.pr and grid.y walks dq.pr, each in chunks of
 * at most MAX_BS*BLOCK_SIZE. The starting offsets of the current tile
 * (numDoneY for dq, numDoneX for dx) are handed to the kernel.
 *
 * NOTE(review): the block counts come from truncating division by BLOCK_SIZE,
 * so this presumably relies on dq.pr and dx.pr being multiples of BLOCK_SIZE
 * -- confirm against the matrix definition.
 *
 * The call blocks until the device is idle. Errors are reported on stderr and
 * otherwise do not change control flow.
 */
void dist1Wrap(const matrix dq, const matrix dx, matrix dD){
  dim3 block(BLOCK_SIZE,BLOCK_SIZE);
  dim3 grid;

  unint numDoneX = 0;
  while( numDoneX < dx.pr ){
    // Columns of the tile: as much of dx as one launch may cover.
    const unint todoX = MIN( dx.pr - numDoneX, MAX_BS*BLOCK_SIZE );
    grid.x = todoX/BLOCK_SIZE;

    unint numDoneY = 0;
    while( numDoneY < dq.pr ){
      // Rows of the tile: as much of dq as one launch may cover.
      const unint todoY = MIN( dq.pr - numDoneY, MAX_BS*BLOCK_SIZE );
      grid.y = todoY/BLOCK_SIZE;

      dist1Kernel<<<grid,block>>>(dq, numDoneY, dx, numDoneX, dD);
      numDoneY += todoY;
    }
    numDoneX += todoX;
  }

  // cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize().
  const cudaError_t syncErr = cudaDeviceSynchronize();
  // Launch-configuration failures are only visible through cudaGetLastError(),
  // which also clears the stored error. It may surface an error left unread
  // by an earlier runtime call in this host thread.
  const cudaError_t launchErr = cudaGetLastError();
  const cudaError_t err = ( syncErr != cudaSuccess ) ? syncErr : launchErr;
  if( err != cudaSuccess )
    fprintf( stderr, "dist1Wrap: CUDA error: %s\n", cudaGetErrorString(err) );
}
35
36
/*
 * Host-side launcher for findRangeKernel over dD, forwarding dranges and
 * cntWant to the kernel unchanged.
 *
 * Thread blocks are 4*BLOCK_SIZE wide and BLOCK_SIZE/4 tall, so one block
 * spans BLOCK_SIZE/4 positions along y and 4*(todo/BLOCK_SIZE) blocks are
 * launched per chunk of `todo` rows. Each chunk is capped at
 * MAX_BS*BLOCK_SIZE/4 rows, which keeps grid.y at or below MAX_BS. The row
 * offset of the chunk (numDone) is passed to the kernel.
 *
 * NOTE(review): todo/BLOCK_SIZE truncates, so dD.pr is presumably a multiple
 * of BLOCK_SIZE -- confirm against the matrix definition.
 *
 * The call blocks until the device is idle. Errors are reported on stderr and
 * otherwise do not change control flow.
 */
void findRangeWrap(const matrix dD, real *dranges, unint cntWant){
  dim3 block(4*BLOCK_SIZE,BLOCK_SIZE/4);
  // grid.y is assigned before every launch, so no initial value is needed.
  dim3 grid;
  grid.x = 1;

  unint numDone = 0;
  while( numDone < dD.pr ){
    const unint todo = MIN ( dD.pr - numDone, MAX_BS*BLOCK_SIZE/4 );
    grid.y = 4*(todo/BLOCK_SIZE);

    findRangeKernel<<<grid,block>>>(dD, numDone, dranges, cntWant);
    numDone += todo;
  }

  // cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize().
  const cudaError_t syncErr = cudaDeviceSynchronize();
  // Launch-configuration failures are only visible through cudaGetLastError(),
  // which also clears the stored error. It may surface an error left unread
  // by an earlier runtime call in this host thread.
  const cudaError_t launchErr = cudaGetLastError();
  const cudaError_t err = ( syncErr != cudaSuccess ) ? syncErr : launchErr;
  if( err != cudaSuccess )
    fprintf( stderr, "findRangeWrap: CUDA error: %s\n", cudaGetErrorString(err) );
}
51
52
/*
 * Host-side launcher for rangeSearchKernel over dD, forwarding dranges and
 * dir to the kernel unchanged.
 *
 * The launch is tiled so that neither grid dimension is asked for more than
 * MAX_BS blocks: grid.x walks dD.pc and grid.y walks dD.pr, each in chunks of
 * at most MAX_BS*BLOCK_SIZE. The tile offsets are passed to the kernel in the
 * order (numDoneX, numDoneY).
 *
 * NOTE(review): the block counts come from truncating division by BLOCK_SIZE,
 * so this presumably relies on dD.pc and dD.pr being multiples of BLOCK_SIZE
 * -- confirm against the matrix definition.
 *
 * The call blocks until the device is idle. Errors are reported on stderr and
 * otherwise do not change control flow.
 */
void rangeSearchWrap(const matrix dD, const real *dranges, charMatrix dir){
  dim3 block(BLOCK_SIZE,BLOCK_SIZE);
  dim3 grid;

  unint numDoneX = 0;
  while( numDoneX < dD.pc ){
    const unint todoX = MIN( dD.pc - numDoneX, MAX_BS*BLOCK_SIZE );
    grid.x = todoX/BLOCK_SIZE;

    unint numDoneY = 0;
    while( numDoneY < dD.pr ){
      const unint todoY = MIN( dD.pr - numDoneY, MAX_BS*BLOCK_SIZE );
      grid.y = todoY/BLOCK_SIZE;

      rangeSearchKernel<<<grid,block>>>(dD, numDoneX, numDoneY, dranges, dir);
      numDoneY += todoY;
    }
    numDoneX += todoX;
  }

  // cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize().
  const cudaError_t syncErr = cudaDeviceSynchronize();
  // Launch-configuration failures are only visible through cudaGetLastError(),
  // which also clears the stored error. It may surface an error left unread
  // by an earlier runtime call in this host thread.
  const cudaError_t launchErr = cudaGetLastError();
  const cudaError_t err = ( syncErr != cudaSuccess ) ? syncErr : launchErr;
  if( err != cudaSuccess )
    fprintf( stderr, "rangeSearchWrap: CUDA error: %s\n", cudaGetErrorString(err) );
}
75
/*
 * Host-side launcher for nnKernel on the pair (dq, dx), forwarding dMins and
 * dMinIDs to the kernel unchanged.
 *
 * The grid is one block wide (grid.x = 1). dq.pr is walked along grid.y in
 * chunks of at most MAX_BS*BLOCK_SIZE so that grid.y never exceeds MAX_BS;
 * the offset of each chunk (numDone) is passed to the kernel.
 *
 * NOTE(review): todo/BLOCK_SIZE truncates, so dq.pr is presumably a multiple
 * of BLOCK_SIZE -- confirm against the matrix definition.
 *
 * The call blocks until the device is idle. Errors are reported on stderr and
 * otherwise do not change control flow.
 */
void nnWrap(const matrix dq, const matrix dx, real *dMins, unint *dMinIDs){
  dim3 block(BLOCK_SIZE,BLOCK_SIZE);
  dim3 grid;
  grid.x = 1;

  unint numDone = 0;
  while( numDone < dq.pr ){
    const unint todo = MIN( dq.pr - numDone, MAX_BS*BLOCK_SIZE );
    grid.y = todo/BLOCK_SIZE;

    nnKernel<<<grid,block>>>(dq,numDone,dx,dMins,dMinIDs);
    numDone += todo;
  }

  // cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize().
  const cudaError_t syncErr = cudaDeviceSynchronize();
  // Launch-configuration failures are only visible through cudaGetLastError(),
  // which also clears the stored error. It may surface an error left unread
  // by an earlier runtime call in this host thread.
  const cudaError_t launchErr = cudaGetLastError();
  const cudaError_t err = ( syncErr != cudaSuccess ) ? syncErr : launchErr;
  if( err != cudaSuccess )
    fprintf( stderr, "nnWrap: CUDA error: %s\n", cudaGetErrorString(err) );
}
93
94
/*
 * Host-side launcher for knnKernel on the pair (dq, dx), forwarding the
 * dMins matrix and the dMinIDs integer matrix to the kernel unchanged.
 *
 * The grid is one block wide (grid.x = 1). dq.pr is walked along grid.y in
 * chunks of at most MAX_BS*BLOCK_SIZE so that grid.y never exceeds MAX_BS;
 * the offset of each chunk (numDone) is passed to the kernel.
 *
 * NOTE(review): todo/BLOCK_SIZE truncates, so dq.pr is presumably a multiple
 * of BLOCK_SIZE -- confirm against the matrix definition.
 *
 * The call blocks until the device is idle. Errors are reported on stderr and
 * otherwise do not change control flow.
 */
void knnWrap(const matrix dq, const matrix dx, matrix dMins, intMatrix dMinIDs){
  dim3 block(BLOCK_SIZE,BLOCK_SIZE);
  dim3 grid;
  grid.x = 1;

  unint numDone = 0;
  while( numDone < dq.pr ){
    const unint todo = MIN( dq.pr - numDone, MAX_BS*BLOCK_SIZE );
    grid.y = todo/BLOCK_SIZE;

    knnKernel<<<grid,block>>>(dq,numDone,dx,dMins,dMinIDs);
    numDone += todo;
  }

  // cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize().
  const cudaError_t syncErr = cudaDeviceSynchronize();
  // Launch-configuration failures are only visible through cudaGetLastError(),
  // which also clears the stored error. It may surface an error left unread
  // by an earlier runtime call in this host thread.
  const cudaError_t launchErr = cudaGetLastError();
  const cudaError_t err = ( syncErr != cudaSuccess ) ? syncErr : launchErr;
  if( err != cudaSuccess )
    fprintf( stderr, "knnWrap: CUDA error: %s\n", cudaGetErrorString(err) );
}
112
113
/*
 * Host-side launcher for planNNKernel. All data arguments (dq, dqMap, dx,
 * dxMap, dMins, dMinIDs, dcP) are forwarded to the kernel unchanged.
 *
 * The grid is one block wide (grid.x = 1). compLength is walked along grid.y
 * in chunks of at most MAX_BS*BLOCK_SIZE so that grid.y never exceeds MAX_BS;
 * the offset of each chunk (numDone) is passed as the kernel's last argument.
 *
 * NOTE(review): todo/BLOCK_SIZE truncates, so compLength is presumably a
 * multiple of BLOCK_SIZE -- confirm with the code that builds the plan.
 *
 * The call blocks until the device is idle. Errors are reported on stderr and
 * otherwise do not change control flow.
 */
void planNNWrap(const matrix dq, const unint *dqMap, const matrix dx, const intMatrix dxMap, real *dMins, unint *dMinIDs, compPlan dcP, unint compLength){
  dim3 block(BLOCK_SIZE,BLOCK_SIZE);
  dim3 grid;
  grid.x = 1;

  unint numDone = 0;
  while( numDone < compLength ){
    const unint todo = MIN( (compLength-numDone) , MAX_BS*BLOCK_SIZE );
    grid.y = todo/BLOCK_SIZE;

    planNNKernel<<<grid,block>>>(dq,dqMap,dx,dxMap,dMins,dMinIDs,dcP,numDone);
    numDone += todo;
  }

  // cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize().
  const cudaError_t syncErr = cudaDeviceSynchronize();
  // Launch-configuration failures are only visible through cudaGetLastError(),
  // which also clears the stored error. It may surface an error left unread
  // by an earlier runtime call in this host thread.
  const cudaError_t launchErr = cudaGetLastError();
  const cudaError_t err = ( syncErr != cudaSuccess ) ? syncErr : launchErr;
  if( err != cudaSuccess )
    fprintf( stderr, "planNNWrap: CUDA error: %s\n", cudaGetErrorString(err) );
}
129
130
/*
 * Host-side launcher for planKNNKernel. All data arguments (dq, dqMap, dx,
 * dxMap, dMins, dMinIDs, dcP) are forwarded to the kernel unchanged.
 *
 * The grid is one block wide (grid.x = 1). compLength is walked along grid.y
 * in chunks of at most MAX_BS*BLOCK_SIZE so that grid.y never exceeds MAX_BS;
 * the offset of each chunk (numDone) is passed as the kernel's last argument.
 *
 * NOTE(review): todo/BLOCK_SIZE truncates, so compLength is presumably a
 * multiple of BLOCK_SIZE -- confirm with the code that builds the plan.
 *
 * The call blocks until the device is idle. Errors are reported on stderr and
 * otherwise do not change control flow.
 */
void planKNNWrap(const matrix dq, const unint *dqMap, const matrix dx, const intMatrix dxMap, matrix dMins, intMatrix dMinIDs, compPlan dcP, unint compLength){
  dim3 block(BLOCK_SIZE,BLOCK_SIZE);
  dim3 grid;
  grid.x = 1;

  unint numDone = 0;
  while( numDone < compLength ){
    const unint todo = MIN( (compLength-numDone) , MAX_BS*BLOCK_SIZE );
    grid.y = todo/BLOCK_SIZE;

    planKNNKernel<<<grid,block>>>(dq,dqMap,dx,dxMap,dMins,dMinIDs,dcP,numDone);
    numDone += todo;
  }

  // cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize().
  const cudaError_t syncErr = cudaDeviceSynchronize();
  // Launch-configuration failures are only visible through cudaGetLastError(),
  // which also clears the stored error. It may surface an error left unread
  // by an earlier runtime call in this host thread.
  const cudaError_t launchErr = cudaGetLastError();
  const cudaError_t err = ( syncErr != cudaSuccess ) ? syncErr : launchErr;
  if( err != cudaSuccess )
    fprintf( stderr, "planKNNWrap: CUDA error: %s\n", cudaGetErrorString(err) );
}
146
147
148
/*
 * Host-side launcher for rangeCountKernel on the pair (dq, dx), forwarding
 * dranges and dcounts to the kernel unchanged.
 *
 * The grid is one block wide (grid.x = 1). dq.pr is walked along grid.y in
 * chunks of at most MAX_BS*BLOCK_SIZE so that grid.y never exceeds MAX_BS;
 * the offset of each chunk (numDone) is passed to the kernel.
 *
 * NOTE(review): todo/BLOCK_SIZE truncates, so dq.pr is presumably a multiple
 * of BLOCK_SIZE -- confirm against the matrix definition.
 *
 * The call blocks until the device is idle. Errors are reported on stderr and
 * otherwise do not change control flow.
 */
void rangeCountWrap(const matrix dq, const matrix dx, real *dranges, unint *dcounts){
  dim3 block(BLOCK_SIZE,BLOCK_SIZE);
  dim3 grid;
  grid.x = 1;

  unint numDone = 0;
  while( numDone < dq.pr ){
    const unint todo = MIN( dq.pr - numDone, MAX_BS*BLOCK_SIZE );
    grid.y = todo/BLOCK_SIZE;

    rangeCountKernel<<<grid,block>>>(dq,numDone,dx,dranges,dcounts);
    numDone += todo;
  }

  // cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize().
  const cudaError_t syncErr = cudaDeviceSynchronize();
  // Launch-configuration failures are only visible through cudaGetLastError(),
  // which also clears the stored error. It may surface an error left unread
  // by an earlier runtime call in this host thread.
  const cudaError_t launchErr = cudaGetLastError();
  const cudaError_t err = ( syncErr != cudaSuccess ) ? syncErr : launchErr;
  if( err != cudaSuccess )
    fprintf( stderr, "rangeCountWrap: CUDA error: %s\n", cudaGetErrorString(err) );
}
165
166 #endif