// RBC.git / sKernel.cu
#ifndef SKERNEL_CU
#define SKERNEL_CU

#include <stdio.h>
#include "sKernel.h"
#include "defs.h"

// Computes, for each row r of the 0/1 matrix in, an exclusive prefix sum of the
// SCAN_WIDTH-wide block handled by this thread block (Blelloch work-efficient scan).
// Each block's total is written to sumaux so that a second pass can stitch the
// per-block scans together into a full row-wide scan.
__global__ void sumKernel(charMatrix in, intMatrix sum, intMatrix sumaux, int n){
  int id = threadIdx.x;
  int bo = blockIdx.x*SCAN_WIDTH; //block offset
  int r = blockIdx.y;
  int d, t;

  const int l=SCAN_WIDTH; //length

  int off=1;

  __shared__ int ssum[l];

  // Each thread loads two elements; positions past the end of the row are padded with 0.
  ssum[2*id] = (bo+2*id < n) ? in.mat[IDX( r, bo+2*id, in.ld )] : 0;
  ssum[2*id+1] = (bo+2*id+1 < n) ? in.mat[IDX( r, bo+2*id+1, in.ld )] : 0;

  //up-sweep: build partial sums in place (a reduction tree)
  for( d=l>>1; d > 0; d>>=1 ){
    __syncthreads();

    if( id < d ){
      ssum[ off*(2*id+2)-1 ] += ssum[ off*(2*id+1)-1 ];
    }
    off *= 2;
  }

  __syncthreads();

  // The last element now holds the block total; record it and clear it so that the
  // down-sweep produces an *exclusive* scan.
  if ( id == 0 ){
    sumaux.mat[IDX( r, blockIdx.x, sumaux.ld )] = ssum[ l-1 ];
    ssum[ l-1 ] = 0;
  }

  //down-sweep: traverse the tree back down, swapping and adding partial sums
  for ( d=1; d<l; d*=2 ){
    off >>= 1;
    __syncthreads();

    if( id < d ){
      t = ssum[ off*(2*id+1)-1 ];
      ssum[ off*(2*id+1)-1 ] = ssum[ off*(2*id+2)-1 ];
      ssum[ off*(2*id+2)-1 ] += t;
    }
  }

  __syncthreads();

  // Write the scanned values back out, again guarding against the row boundary.
  if( bo+2*id < n )
    sum.mat[IDX( r, bo+2*id, sum.ld )] = ssum[2*id];
  if( bo+2*id+1 < n )
    sum.mat[IDX( r, bo+2*id+1, sum.ld )] = ssum[2*id+1];
}

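/* Worked example (illustrative only; the real SCAN_WIDTH is defined in sKernel.h).
 * If SCAN_WIDTH were 8 and one block of a row held
 *     in  = [1 0 1 1 0 0 1 0],
 * sumKernel would write the exclusive prefix sum of that block to sum,
 *     sum = [0 1 1 2 3 3 3 4],
 * and the block total, 4, to that block's column of sumaux.  Entries past the end
 * of the row (bo+2*id >= n) are treated as 0 and left unwritten.
 */
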
// Adds the scanned per-block totals (daux) back onto each block's local scan, turning
// the independent per-block scans into a single row-wide exclusive scan.
__global__ void combineSumKernel(intMatrix sum, intMatrix daux, int n){
  int id = threadIdx.x;
  int bo = blockIdx.x * SCAN_WIDTH;
  int r = blockIdx.y;

  if(bo+2*id < n)
    sum.mat[IDX( r, bo+2*id, sum.ld )] += daux.mat[IDX( r, blockIdx.x, daux.ld )];
  if(bo+2*id+1 < n)
    sum.mat[IDX( r, bo+2*id+1, sum.ld )] += daux.mat[IDX( r, blockIdx.x, daux.ld )];
}

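/* Worked example of the two-level scheme (illustrative only, with a pretend
 * SCAN_WIDTH of 4).  For the row [1 0 1 1 | 0 1 1 0], sumKernel produces the
 * per-block scans [0 1 1 2 | 0 0 1 2] and block totals daux = [3 2].  Scanning the
 * totals gives [0 3], and combineSumKernel adds 0 to the first block and 3 to the
 * second, yielding [0 1 1 2 | 3 3 4 5]: the exclusive scan of the whole row.
 */
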
// counts[r] = number of nonzero entries in row r of ir.  sums holds the exclusive
// scan of ir, so the count is the last scan entry, plus one if the last element of
// the row is itself set.
__global__ void getCountsKernel(int *counts, charMatrix ir, intMatrix sums){
  int r = blockIdx.x*BLOCK_SIZE + threadIdx.x;
  if ( r < ir.r )
    counts[r] = ir.mat[IDX( r, ir.c-1, ir.ld )] ? sums.mat[IDX( r, sums.c-1, sums.ld )]+1 : sums.mat[IDX( r, sums.c-1, sums.ld )];
}

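/* Example: for the row ir = [1 0 1 1 0] with exclusive scan sums = [0 1 1 2 3], the
 * last element of ir is 0, so counts[r] = 3, the number of ones in the row.  Had the
 * last element been set (e.g. ir = [1 0 1 1 1], sums unchanged), the +1 branch would
 * give counts[r] = 4.
 */
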
// Stream compaction: every set entry of ir has its column index written to map at the
// position given by the exclusive scan in sums, so row r+offSet of map lists the
// columns of the nonzero entries of row r of ir, in order.
__global__ void buildMapKernel(intMatrix map, charMatrix ir, intMatrix sums, int offSet){
  int id = threadIdx.x;
  int bo = blockIdx.x * SCAN_WIDTH;
  int r = blockIdx.y;

  if(bo+2*id < ir.c && ir.mat[IDX( r, bo+2*id, ir.ld )])
    map.mat[IDX( r+offSet, sums.mat[IDX( r, bo+2*id, sums.ld )], map.ld )] = bo+2*id;
  if(bo+2*id+1 < ir.c && ir.mat[IDX( r, bo+2*id+1, ir.ld )])
    map.mat[IDX( r+offSet, sums.mat[IDX( r, bo+2*id+1, sums.ld )], map.ld )] = bo+2*id+1;
}

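/* Example: for ir = [1 0 1 1 0] with sums = [0 1 1 2 3], the set columns 0, 2, and 3
 * are written to positions sums[0]=0, sums[2]=1, and sums[3]=2 of row r+offSet of
 * map, so that row of map begins [0 2 3 ...]: a compacted list of the 1-columns of ir.
 */
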
void getCountsWrap(int *counts, charMatrix ir, intMatrix sums){
  dim3 block(BLOCK_SIZE,1);
  dim3 grid(ir.pr/BLOCK_SIZE,1); // ir.pr is the padded row count, assumed a multiple of BLOCK_SIZE
  getCountsKernel<<<grid,block>>>(counts, ir, sums);
}

void buildMapWrap(intMatrix map, charMatrix ir, intMatrix sums, int offSet){
  int numScans = (ir.c+SCAN_WIDTH-1)/SCAN_WIDTH; // number of SCAN_WIDTH-wide blocks per row
  dim3 block( SCAN_WIDTH/2, 1 ); // each thread handles two elements
  dim3 grid( numScans, ir.r );
  buildMapKernel<<<grid,block>>>(map, ir, sums, offSet);
}

// Host-side driver: computes, for every row of in, the exclusive prefix sum of its
// entries, writing the result to sum.  Uses a three-pass, two-level scheme:
// per-block scans, a scan of the block totals, then a combine pass.
void sumWrap(charMatrix in, intMatrix sum){
  int n = in.c;
  int numScans = (n+SCAN_WIDTH-1)/SCAN_WIDTH;

  // The block totals are themselves scanned by a single block, so at most
  // SCAN_WIDTH blocks (i.e. n <= SCAN_WIDTH*SCAN_WIDTH) can be handled.
  if(numScans > SCAN_WIDTH){
    fprintf(stderr,"scan is too large; exiting\n");
    exit(1);
  }

  intMatrix dAux;   // per-block totals, one column per scan block
  dAux.r=dAux.pr=in.r; dAux.c=dAux.pc=dAux.ld=numScans;
  intMatrix dDummy; // receives the (unused) total of the auxiliary scan
  dDummy.r=dDummy.pr=in.r; dDummy.c=dDummy.pc=dDummy.ld=1;

  cudaMalloc( (void**)&dAux.mat, dAux.pr*dAux.pc*sizeof(*dAux.mat) );
  cudaMalloc( (void**)&dDummy.mat, dDummy.pr*dDummy.pc*sizeof(*dDummy.mat) );

  dim3 block( SCAN_WIDTH/2, 1 );
  dim3 grid( numScans, in.r );

  // Pass 1: scan each SCAN_WIDTH-wide block independently; block totals go to dAux.
  sumKernel<<<grid,block>>>(in, sum, dAux, n);
  cudaThreadSynchronize();

  // Pass 2: scan the block totals themselves; a single block per row suffices.
  grid.x=1;
  sumKernelI<<<grid,block>>>(dAux, dAux, dDummy, numScans);
  cudaThreadSynchronize();

  // Pass 3: add the scanned totals back onto each block's local scan.
  grid.x=numScans;
  combineSumKernel<<<grid,block>>>(sum,dAux,n);
  cudaThreadSynchronize();

  cudaFree(dAux.mat);
  cudaFree(dDummy.mat);
}


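/* Illustrative usage sketch -- not part of the original RBC code.  It shows one way
 * sumWrap might be called from the host, assuming the charMatrix/intMatrix layout
 * used throughout this file (fields mat, r, c, pr, pc, ld) and device-resident
 * matrices, which is what sumWrap expects.  The function name and parameters are
 * hypothetical; error checking is omitted for brevity.
 */
void exampleSumWrapUsage(char *hIn, int *hOut, int rows, int cols){
  charMatrix dIn;  // hypothetical device input: rows x cols matrix of 0/1 chars
  intMatrix dSum;  // hypothetical device output: row-wise exclusive prefix sums

  dIn.r  = dIn.pr  = rows;  dIn.c  = dIn.pc  = dIn.ld  = cols;
  dSum.r = dSum.pr = rows;  dSum.c = dSum.pc = dSum.ld = cols;

  cudaMalloc( (void**)&dIn.mat,  dIn.pr*dIn.pc*sizeof(*dIn.mat) );
  cudaMalloc( (void**)&dSum.mat, dSum.pr*dSum.pc*sizeof(*dSum.mat) );

  cudaMemcpy( dIn.mat, hIn, rows*cols*sizeof(*dIn.mat), cudaMemcpyHostToDevice );

  // Three-pass scan; afterwards row i of dSum holds the exclusive prefix sum of row i of dIn.
  sumWrap( dIn, dSum );

  cudaMemcpy( hOut, dSum.mat, rows*cols*sizeof(*dSum.mat), cudaMemcpyDeviceToHost );

  cudaFree( dIn.mat );
  cudaFree( dSum.mat );
}
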
//This is the same as sumKernel, but takes an int matrix as input.
__global__ void sumKernelI(intMatrix in, intMatrix sum, intMatrix sumaux, int n){
  int id = threadIdx.x;
  int bo = blockIdx.x*SCAN_WIDTH; //block offset
  int r = blockIdx.y;
  int d, t;

  const int l=SCAN_WIDTH; //length

  int off=1;

  __shared__ int ssum[l];

  ssum[2*id] = (bo+2*id < n) ? in.mat[IDX( r, bo+2*id, in.ld )] : 0;
  ssum[2*id+1] = (bo+2*id+1 < n) ? in.mat[IDX( r, bo+2*id+1, in.ld )] : 0;

  //up-sweep
  for( d=l>>1; d > 0; d>>=1 ){
    __syncthreads();

    if( id < d ){
      ssum[ off*(2*id+2)-1 ] += ssum[ off*(2*id+1)-1 ];
    }
    off *= 2;
  }

  __syncthreads();

  if ( id == 0 ){
    sumaux.mat[IDX( r, blockIdx.x, sumaux.ld )] = ssum[ l-1 ];
    ssum[ l-1 ] = 0;
  }

  //down-sweep
  for ( d=1; d<l; d*=2 ){
    off >>= 1;
    __syncthreads();

    if( id < d ){
      t = ssum[ off*(2*id+1)-1 ];
      ssum[ off*(2*id+1)-1 ] = ssum[ off*(2*id+2)-1 ];
      ssum[ off*(2*id+2)-1 ] += t;
    }
  }

  __syncthreads();

  if( bo+2*id < n )
    sum.mat[IDX( r, bo+2*id, sum.ld )] = ssum[2*id];
  if( bo+2*id+1 < n )
    sum.mat[IDX( r, bo+2*id+1, sum.ld )] = ssum[2*id+1];
}

#endif