Cleaned up, debugged. Ready for 1st release
[RBC.git] / kernelWrap.cu
1 /* This file is part of the Random Ball Cover (RBC) library.
2 * (C) Copyright 2010, Lawrence Cayton [lcayton@tuebingen.mpg.de]
3 */
4
5 #ifndef KERNELWRAP_CU
6 #define KERNELWRAP_CU
7
8 #include<cuda.h>
9 #include<stdio.h>
10 #include "kernels.h"
11 #include "defs.h"
12
/* Host wrapper that launches dist1Kernel over dq and dx, writing into dD.
 *
 * The work is tiled in two dimensions: x walks dx.pr and y walks dq.pr.
 * Each launch covers at most MAX_BS*BLOCK_SIZE entries per dimension, so
 * neither grid dimension exceeds MAX_BS blocks; the starting offsets
 * (numDoneY, numDoneX) are handed to the kernel.
 *
 * NOTE(review): grid sizes use truncating division, so this presumably
 * relies on dx.pr and dq.pr being padded to multiples of BLOCK_SIZE --
 * confirm against the matrix allocation code.
 *
 * Launch and synchronization failures are reported on stderr; the function
 * still returns normally because its signature has no error channel.
 */
void dist1Wrap(const matrix dq, const matrix dx, matrix dD){
  dim3 block(BLOCK_SIZE,BLOCK_SIZE);
  dim3 grid;
  cudaError_t err;

  unint todoX, todoY, numDoneX, numDoneY;

  numDoneX = 0;
  while ( numDoneX < dx.pr ){
    todoX = MIN( dx.pr - numDoneX, MAX_BS*BLOCK_SIZE );
    grid.x = todoX/BLOCK_SIZE;
    numDoneY = 0;
    while( numDoneY < dq.pr ){
      todoY = MIN( dq.pr - numDoneY, MAX_BS*BLOCK_SIZE );
      grid.y = todoY/BLOCK_SIZE;
      dist1Kernel<<<grid,block>>>(dq, numDoneY, dx, numDoneX, dD);
      // A launch returns nothing itself; bad configurations only show up here.
      err = cudaGetLastError();
      if( err != cudaSuccess )
        fprintf(stderr, "dist1Wrap: kernel launch failed: %s\n", cudaGetErrorString(err));
      numDoneY += todoY;
    }
    numDoneX += todoX;
  }

  // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize and
  // also surfaces errors raised while the kernels were executing.
  err = cudaDeviceSynchronize();
  if( err != cudaSuccess )
    fprintf(stderr, "dist1Wrap: synchronization failed: %s\n", cudaGetErrorString(err));
}
35
36
/* Host wrapper that launches findRangeKernel over the rows of dD, passing
 * dranges and cntWant through to the kernel.
 *
 * Blocks are 4*BLOCK_SIZE threads wide and BLOCK_SIZE/4 threads tall. Each
 * launch covers at most MAX_BS*BLOCK_SIZE/4 rows, which keeps grid.y at or
 * below MAX_BS blocks; numDone is the row offset given to the kernel.
 *
 * NOTE(review): grid.y uses truncating division, so this presumably relies
 * on dD.pr being padded to a multiple of BLOCK_SIZE -- confirm against the
 * matrix allocation code.
 *
 * Launch and synchronization failures are reported on stderr; the function
 * still returns normally because its signature has no error channel.
 */
void findRangeWrap(const matrix dD, real *dranges, unint cntWant){
  dim3 block(4*BLOCK_SIZE,BLOCK_SIZE/4);
  dim3 grid;
  unint numDone, todo;
  cudaError_t err;

  // grid.y is recomputed before every launch, so only grid.x is fixed here.
  grid.x = 1;

  numDone=0;
  while( numDone < dD.pr ){
    todo = MIN ( dD.pr - numDone, MAX_BS*BLOCK_SIZE/4 );
    grid.y = 4*(todo/BLOCK_SIZE);
    findRangeKernel<<<grid,block>>>(dD, numDone, dranges, cntWant);
    // A launch returns nothing itself; bad configurations only show up here.
    err = cudaGetLastError();
    if( err != cudaSuccess )
      fprintf(stderr, "findRangeWrap: kernel launch failed: %s\n", cudaGetErrorString(err));
    numDone += todo;
  }

  // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize and
  // also surfaces errors raised while the kernels were executing.
  err = cudaDeviceSynchronize();
  if( err != cudaSuccess )
    fprintf(stderr, "findRangeWrap: synchronization failed: %s\n", cudaGetErrorString(err));
}
51
52
/* Host wrapper that launches rangeSearchKernel over dD, passing dranges and
 * dir through to the kernel.
 *
 * The work is tiled in two dimensions: x walks dD.pc and y walks dD.pr.
 * Each launch covers at most MAX_BS*BLOCK_SIZE entries per dimension, so
 * neither grid dimension exceeds MAX_BS blocks; the starting offsets
 * (numDoneX, numDoneY) are handed to the kernel.
 *
 * NOTE(review): grid sizes use truncating division, so this presumably
 * relies on dD.pc and dD.pr being padded to multiples of BLOCK_SIZE --
 * confirm against the matrix allocation code.
 *
 * Launch and synchronization failures are reported on stderr; the function
 * still returns normally because its signature has no error channel.
 */
void rangeSearchWrap(const matrix dD, const real *dranges, charMatrix dir){
  dim3 block(BLOCK_SIZE,BLOCK_SIZE);
  dim3 grid;
  cudaError_t err;

  unint todoX, todoY, numDoneX, numDoneY;

  numDoneX = 0;
  while ( numDoneX < dD.pc ){
    todoX = MIN( dD.pc - numDoneX, MAX_BS*BLOCK_SIZE );
    grid.x = todoX/BLOCK_SIZE;
    numDoneY = 0;
    while( numDoneY < dD.pr ){
      todoY = MIN( dD.pr - numDoneY, MAX_BS*BLOCK_SIZE );
      grid.y = todoY/BLOCK_SIZE;
      rangeSearchKernel<<<grid,block>>>(dD, numDoneX, numDoneY, dranges, dir);
      // A launch returns nothing itself; bad configurations only show up here.
      err = cudaGetLastError();
      if( err != cudaSuccess )
        fprintf(stderr, "rangeSearchWrap: kernel launch failed: %s\n", cudaGetErrorString(err));
      numDoneY += todoY;
    }
    numDoneX += todoX;
  }

  // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize and
  // also surfaces errors raised while the kernels were executing.
  err = cudaDeviceSynchronize();
  if( err != cudaSuccess )
    fprintf(stderr, "rangeSearchWrap: synchronization failed: %s\n", cudaGetErrorString(err));
}
75
/* Host wrapper that launches nnKernel for the rows of dq against dx, passing
 * dMins and dMinIDs through to the kernel.
 *
 * The grid is one block wide; rows of dq are processed in chunks of at most
 * MAX_BS*BLOCK_SIZE so grid.y never exceeds MAX_BS blocks. numDone is the
 * row offset given to the kernel.
 *
 * NOTE(review): grid.y uses truncating division, so this presumably relies
 * on dq.pr being padded to a multiple of BLOCK_SIZE -- confirm against the
 * matrix allocation code.
 *
 * Launch and synchronization failures are reported on stderr; the function
 * still returns normally because its signature has no error channel.
 */
void nnWrap(const matrix dq, const matrix dx, real *dMins, unint *dMinIDs){
  dim3 block(BLOCK_SIZE,BLOCK_SIZE);
  dim3 grid;
  unint numDone, todo;
  cudaError_t err;

  grid.x = 1;

  numDone = 0;
  while( numDone < dq.pr ){
    todo = MIN( dq.pr - numDone, MAX_BS*BLOCK_SIZE );
    grid.y = todo/BLOCK_SIZE;
    nnKernel<<<grid,block>>>(dq,numDone,dx,dMins,dMinIDs);
    // A launch returns nothing itself; bad configurations only show up here.
    err = cudaGetLastError();
    if( err != cudaSuccess )
      fprintf(stderr, "nnWrap: kernel launch failed: %s\n", cudaGetErrorString(err));
    numDone += todo;
  }

  // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize and
  // also surfaces errors raised while the kernels were executing.
  err = cudaDeviceSynchronize();
  if( err != cudaSuccess )
    fprintf(stderr, "nnWrap: synchronization failed: %s\n", cudaGetErrorString(err));
}
93
94
/* Host wrapper that launches planNNKernel over compLength entries, passing
 * dq, dqMap, dx, dxMap, dMins, dMinIDs and dcP through to the kernel.
 *
 * The grid is one block wide; the compLength entries are processed in chunks
 * of at most MAX_BS*BLOCK_SIZE so grid.y never exceeds MAX_BS blocks.
 * numDone is the offset given to the kernel.
 *
 * NOTE(review): grid.y uses truncating division, so this presumably relies
 * on compLength being a multiple of BLOCK_SIZE -- confirm against the code
 * that builds the compPlan.
 *
 * Launch and synchronization failures are reported on stderr; the function
 * still returns normally because its signature has no error channel.
 */
void planNNWrap(const matrix dq, const unint *dqMap, const matrix dx, const intMatrix dxMap, real *dMins, unint *dMinIDs, compPlan dcP, unint compLength){
  dim3 block(BLOCK_SIZE,BLOCK_SIZE);
  dim3 grid;
  unint todo;
  cudaError_t err;

  grid.x = 1;
  unint numDone = 0;
  while( numDone<compLength ){
    todo = MIN( (compLength-numDone) , MAX_BS*BLOCK_SIZE );
    grid.y = todo/BLOCK_SIZE;
    planNNKernel<<<grid,block>>>(dq,dqMap,dx,dxMap,dMins,dMinIDs,dcP,numDone);
    // A launch returns nothing itself; bad configurations only show up here.
    err = cudaGetLastError();
    if( err != cudaSuccess )
      fprintf(stderr, "planNNWrap: kernel launch failed: %s\n", cudaGetErrorString(err));
    numDone += todo;
  }

  // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize and
  // also surfaces errors raised while the kernels were executing.
  err = cudaDeviceSynchronize();
  if( err != cudaSuccess )
    fprintf(stderr, "planNNWrap: synchronization failed: %s\n", cudaGetErrorString(err));
}
110
111
/* Host wrapper that launches rangeCountKernel for the rows of dq against dx,
 * passing dranges and dcounts through to the kernel.
 *
 * The grid is one block wide; rows of dq are processed in chunks of at most
 * MAX_BS*BLOCK_SIZE so grid.y never exceeds MAX_BS blocks. numDone is the
 * row offset given to the kernel.
 *
 * NOTE(review): grid.y uses truncating division, so this presumably relies
 * on dq.pr being padded to a multiple of BLOCK_SIZE -- confirm against the
 * matrix allocation code.
 *
 * Launch and synchronization failures are reported on stderr; the function
 * still returns normally because its signature has no error channel.
 */
void rangeCountWrap(const matrix dq, const matrix dx, real *dranges, unint *dcounts){
  dim3 block(BLOCK_SIZE,BLOCK_SIZE);
  dim3 grid;
  unint numDone, todo;
  cudaError_t err;

  grid.x=1;

  numDone = 0;
  while( numDone < dq.pr ){
    todo = MIN( dq.pr - numDone, MAX_BS*BLOCK_SIZE );
    grid.y = todo/BLOCK_SIZE;
    rangeCountKernel<<<grid,block>>>(dq,numDone,dx,dranges,dcounts);
    // A launch returns nothing itself; bad configurations only show up here.
    err = cudaGetLastError();
    if( err != cudaSuccess )
      fprintf(stderr, "rangeCountWrap: kernel launch failed: %s\n", cudaGetErrorString(err));
    numDone += todo;
  }

  // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize and
  // also surfaces errors raised while the kernels were executing.
  err = cudaDeviceSynchronize();
  if( err != cudaSuccess )
    fprintf(stderr, "rangeCountWrap: synchronization failed: %s\n", cudaGetErrorString(err));
}
128
129 #endif