Basic cleanup
[RBC.git] / kernelWrap.cu
1 #ifndef KERNELWRAP_CU
2 #define KERNELWRAP_CU
3
4 #include<cuda.h>
5 #include<stdio.h>
6 #include "kernels.h"
7 #include "defs.h"
8
/*
 * dist1Wrap: host wrapper that launches dist1Kernel over every
 * (dx row-tile, dq row-tile) pair and then waits for the device.
 *
 *   dq, dx - input matrices; only their .pr fields are read here.
 *   dD     - passed through to the kernel (presumably the output
 *            matrix -- confirm against dist1Kernel).
 *
 * The work is cut into tiles of at most MAX_BS*BLOCK_SIZE rows in each
 * direction, so neither grid dimension ever exceeds MAX_BS blocks.  The
 * offsets numDoneY / numDoneX tell the kernel where its tile starts.
 *
 * NOTE(review): grid sizes use truncating division, so this assumes
 * dq.pr and dx.pr are multiples of BLOCK_SIZE (presumably "padded rows");
 * a remainder would be silently dropped -- confirm against the allocator.
 *
 * Errors are reported on stderr; the function still returns normally so
 * existing callers are unaffected.
 */
void dist1Wrap(matrix dq, matrix dx, matrix dD){
  dim3 block(BLOCK_SIZE,BLOCK_SIZE);
  dim3 grid;

  for( int numDoneX = 0; numDoneX < dx.pr; ){
    // Rows of dx handled by this column of launches.
    int todoX = min( dx.pr - numDoneX, MAX_BS*BLOCK_SIZE );
    grid.x = todoX/BLOCK_SIZE;

    for( int numDoneY = 0; numDoneY < dq.pr; ){
      // Rows of dq handled by this single launch.
      int todoY = min( dq.pr - numDoneY, MAX_BS*BLOCK_SIZE );
      grid.y = todoY/BLOCK_SIZE;
      dist1Kernel<<<grid,block>>>(dq, numDoneY, dx, numDoneX, dD);
      numDoneY += todoY;
    }
    numDoneX += todoX;
  }

  // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
  // replacement and also returns errors raised by the kernels above.
  cudaError_t err = cudaDeviceSynchronize();
  // Launch-configuration failures are only recorded as the runtime's
  // "last error"; peek (do not clear) so callers can still inspect it.
  if( err == cudaSuccess )
    err = cudaPeekAtLastError();
  if( err != cudaSuccess )
    fprintf(stderr, "dist1Wrap: CUDA error: %s\n", cudaGetErrorString(err));
}
31
32
/*
 * findRangeWrap: host wrapper that launches findRangeKernel once over all
 * rows of dD and then waits for the device.
 *
 *   dD      - matrix handed to the kernel; only dD.pr is read here.
 *   dranges - device pointer handed to the kernel (presumably one output
 *             range per row -- confirm against findRangeKernel).
 *   cntWant - passed through to the kernel unchanged.
 *
 * Launch shape: blocks are 4*BLOCK_SIZE wide and BLOCK_SIZE/4 tall, and the
 * grid is 1 x 4*(dD.pr/BLOCK_SIZE).  When BLOCK_SIZE is a multiple of 4 and
 * dD.pr is a multiple of BLOCK_SIZE, grid.y*block.y equals dD.pr.
 *
 * NOTE(review): grid.y is not capped here, so a very large dD.pr could
 * exceed the device's grid limit; that failure is now reported below.
 *
 * Errors are reported on stderr; the function still returns normally.
 */
void findRangeWrap(matrix dD, real *dranges, int cntWant){
  dim3 block(4*BLOCK_SIZE,BLOCK_SIZE/4);
  dim3 grid(1,4*(dD.pr/BLOCK_SIZE));

  findRangeKernel<<<grid,block>>>(dD,dranges,cntWant);

  // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
  // replacement and also returns errors raised by the kernel above.
  cudaError_t err = cudaDeviceSynchronize();
  // Launch-configuration failures are only recorded as the runtime's
  // "last error"; peek (do not clear) so callers can still inspect it.
  if( err == cudaSuccess )
    err = cudaPeekAtLastError();
  if( err != cudaSuccess )
    fprintf(stderr, "findRangeWrap: CUDA error: %s\n", cudaGetErrorString(err));
}
42
/*
 * rangeSearchWrap: host wrapper that launches rangeSearchKernel over every
 * (column-tile, row-tile) of dD and then waits for the device.
 *
 *   dD      - matrix handed to the kernel; dD.pc and dD.pr size the tiling.
 *   dranges - device pointer handed to the kernel unchanged.
 *   dir     - charMatrix handed to the kernel (presumably the output
 *             indicator matrix -- confirm against rangeSearchKernel).
 *
 * The work is cut into tiles of at most MAX_BS*BLOCK_SIZE entries in each
 * direction, so neither grid dimension ever exceeds MAX_BS blocks.  The
 * offsets numDoneX (columns) and numDoneY (rows) locate the tile.
 *
 * NOTE(review): grid sizes use truncating division, so this assumes
 * dD.pc and dD.pr are multiples of BLOCK_SIZE -- confirm.
 *
 * Errors are reported on stderr; the function still returns normally.
 */
void rangeSearchWrap(matrix dD, real *dranges, charMatrix dir){
  dim3 block(BLOCK_SIZE,BLOCK_SIZE);
  // Both grid dimensions are assigned before every launch, so no initial
  // size is needed here.
  dim3 grid;

  for( int numDoneX = 0; numDoneX < dD.pc; ){
    // Columns of dD handled by this column of launches.
    int todoX = min( dD.pc - numDoneX, MAX_BS*BLOCK_SIZE );
    grid.x = todoX/BLOCK_SIZE;

    for( int numDoneY = 0; numDoneY < dD.pr; ){
      // Rows of dD handled by this single launch.
      int todoY = min( dD.pr - numDoneY, MAX_BS*BLOCK_SIZE );
      grid.y = todoY/BLOCK_SIZE;
      rangeSearchKernel<<<grid,block>>>(dD, numDoneX, numDoneY, dranges, dir);
      numDoneY += todoY;
    }
    numDoneX += todoX;
  }

  // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
  // replacement and also returns errors raised by the kernels above.
  cudaError_t err = cudaDeviceSynchronize();
  // Launch-configuration failures are only recorded as the runtime's
  // "last error"; peek (do not clear) so callers can still inspect it.
  if( err == cudaSuccess )
    err = cudaPeekAtLastError();
  if( err != cudaSuccess )
    fprintf(stderr, "rangeSearchWrap: CUDA error: %s\n", cudaGetErrorString(err));
}
65
/*
 * nnWrap: host wrapper that launches nnKernel once and then waits for the
 * device.
 *
 *   dx, dy  - matrices handed to the kernel; only dx.pr is read here.
 *   dMins   - device pointer handed to the kernel (presumably receives the
 *             minimum value per row of dx -- confirm against nnKernel).
 *   dMinIDs - device pointer handed to the kernel (presumably receives the
 *             index of that minimum -- confirm against nnKernel).
 *
 * Launch shape: BLOCK_SIZE x BLOCK_SIZE threads per block, grid of
 * 1 x ceil(dx.pr / BLOCK_SIZE) blocks (the division is rounded up).
 *
 * NOTE(review): grid.y is not capped here, so a very large dx.pr could
 * exceed the device's grid limit; that failure is now reported below.
 *
 * Errors are reported on stderr; the function still returns normally.
 */
void nnWrap(const matrix dx, const matrix dy, real *dMins, int *dMinIDs){
  dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
  dim3 dimGrid;

  dimGrid.x = 1;
  // Round up so a trailing partial block of rows is still covered.
  dimGrid.y = dx.pr/dimBlock.y + (dx.pr%dimBlock.y==0 ? 0 : 1);
  nnKernel<<<dimGrid,dimBlock>>>(dx,dy,dMins,dMinIDs);

  // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
  // replacement and also returns errors raised by the kernel above.
  cudaError_t err = cudaDeviceSynchronize();
  // Launch-configuration failures are only recorded as the runtime's
  // "last error"; peek (do not clear) so callers can still inspect it.
  if( err == cudaSuccess )
    err = cudaPeekAtLastError();
  if( err != cudaSuccess )
    fprintf(stderr, "nnWrap: CUDA error: %s\n", cudaGetErrorString(err));
}
75
76
/*
 * rangeCountWrap: host wrapper that launches rangeCountKernel once and then
 * waits for the device.
 *
 *   dq, dx  - matrices handed to the kernel; only dq.pr is read here.
 *   dranges - device pointer handed to the kernel unchanged.
 *   dcounts - device pointer handed to the kernel (presumably receives one
 *             count per row of dq -- confirm against rangeCountKernel).
 *
 * Launch shape: BLOCK_SIZE x BLOCK_SIZE threads per block, grid of
 * 1 x (dq.pr/BLOCK_SIZE) blocks.
 *
 * NOTE(review): the division truncates, so this assumes dq.pr is a multiple
 * of BLOCK_SIZE; grid.y is also not capped, so a very large dq.pr could
 * exceed the device's grid limit.  Launch failures are now reported below.
 *
 * Errors are reported on stderr; the function still returns normally.
 */
void rangeCountWrap(const matrix dq, const matrix dx, real *dranges, int *dcounts){
  dim3 block(BLOCK_SIZE,BLOCK_SIZE);
  dim3 grid(1,dq.pr/BLOCK_SIZE);

  rangeCountKernel<<<grid,block>>>(dq,dx,dranges,dcounts);

  // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
  // replacement and also returns errors raised by the kernel above.
  cudaError_t err = cudaDeviceSynchronize();
  // Launch-configuration failures are only recorded as the runtime's
  // "last error"; peek (do not clear) so callers can still inspect it.
  if( err == cudaSuccess )
    err = cudaPeekAtLastError();
  if( err != cudaSuccess )
    fprintf(stderr, "rangeCountWrap: CUDA error: %s\n", cudaGetErrorString(err));
}
84
85
86 /*NOTE: can be deleted */
/*
 * pruneWrap: host wrapper that launches pruneKernel once over dD and then
 * waits for the device.
 *
 *   dcM     - charMatrix handed to the kernel as its last argument
 *             (presumably the output mask -- confirm against pruneKernel).
 *   dD      - matrix handed to the kernel; dD.pr and dD.pc size the grid.
 *   dradiiX - device pointer handed to the kernel unchanged.
 *   dradiiQ - device pointer handed to the kernel unchanged.
 *
 * Launch shape: BLOCK_SIZE x BLOCK_SIZE threads per block, grid of
 * (dD.pr/BLOCK_SIZE) x (dD.pc/BLOCK_SIZE) blocks, i.e. grid.x follows the
 * row count and grid.y the column count.
 *
 * NOTE(review): the divisions truncate, so this assumes dD.pr and dD.pc are
 * multiples of BLOCK_SIZE; neither grid dimension is capped.  Launch
 * failures are now reported below.
 *
 * Errors are reported on stderr; the function still returns normally.
 */
void pruneWrap(charMatrix dcM, matrix dD, real *dradiiX, real *dradiiQ){
  dim3 block(BLOCK_SIZE,BLOCK_SIZE);
  dim3 grid(dD.pr/BLOCK_SIZE,dD.pc/BLOCK_SIZE);

  // Note the argument order differs from this wrapper's parameter order.
  pruneKernel<<<grid,block>>>(dD,dradiiX,dradiiQ,dcM);

  // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
  // replacement and also returns errors raised by the kernel above.
  cudaError_t err = cudaDeviceSynchronize();
  // Launch-configuration failures are only recorded as the runtime's
  // "last error"; peek (do not clear) so callers can still inspect it.
  if( err == cudaSuccess )
    err = cudaPeekAtLastError();
  if( err != cudaSuccess )
    fprintf(stderr, "pruneWrap: CUDA error: %s\n", cudaGetErrorString(err));
}
94 #endif