// Added text files; minor cleanup
// [RBC.git] / sKernel.cu
1 #ifndef SKERNEL_CU
2 #define SKERNEL_CU
3
4 #include<stdio.h>
5 #include "sKernel.h"
6 #include "defs.h"
7
// Work-efficient (Blelloch) exclusive prefix sum over one SCAN_WIDTH-wide tile
// of each row of the 0/1 matrix `in`, writing the per-element scan into `sum`
// and the tile's total into `sumaux` (so a second pass can combine tiles).
// Expected launch: grid = (numTiles, numRows), block = SCAN_WIDTH/2 threads;
// each thread owns two adjacent elements.  Uses SCAN_WIDTH ints of static
// shared memory.  `n` is the valid row length; out-of-range lanes load 0.
__global__ void sumKernel(charMatrix in, intMatrix sum, intMatrix sumaux, int n){
  int id = threadIdx.x;
  int bo = blockIdx.x*SCAN_WIDTH; //block offset
  int r = blockIdx.y;
  int d, t;

  const int l=SCAN_WIDTH; //length

  int off=1;

  __shared__ int ssum[l];

  // Stage this block's tile into shared memory; columns past n contribute 0
  // so partial tiles scan correctly.
  ssum[2*id] = (bo+2*id < n) ? in.mat[IDX( r, bo+2*id, in.ld )] : 0;
  ssum[2*id+1] = (bo+2*id+1 < n) ? in.mat[IDX( r, bo+2*id+1, in.ld)] : 0;

  //up-sweep: build a binary reduction tree in place; after the loop
  //ssum[l-1] holds the tile total and off == l.
  for( d=l>>1; d > 0; d>>=1 ){
    __syncthreads();

    if( id < d ){
      ssum[ off*(2*id+2)-1 ] += ssum[ off*(2*id+1)-1 ];
    }
    off *= 2;
  }

  __syncthreads();

  // Record the tile total for the second-level scan, then clear the root
  // to turn the down-sweep into an EXCLUSIVE scan.
  if ( id == 0 ){
    sumaux.mat[IDX( r, blockIdx.x, sumaux.ld )] = ssum[ l-1 ];
    ssum[ l-1 ] = 0;
  }

  //down-sweep: walk the tree back down, swapping/accumulating partial sums
  //so each slot ends with the sum of everything strictly to its left.
  for ( d=1; d<l; d*=2 ){
    off >>= 1;
    __syncthreads();

    if( id < d ){
      t = ssum[ off*(2*id+1)-1 ];
      ssum[ off*(2*id+1)-1 ] = ssum[ off*(2*id+2)-1 ];
      ssum[ off*(2*id+2)-1 ] += t;
    }
  }

  __syncthreads();

  // Write the scanned tile back to global memory, guarding the row tail.
  if( bo+2*id < n )
    sum.mat[IDX( r, bo+2*id, sum.ld )] = ssum[2*id];
  if( bo+2*id+1 < n )
    sum.mat[IDX( r, bo+2*id+1, sum.ld )] = ssum[2*id+1];

}
61
// Completes the full-row scan: adds each tile's starting offset (the scanned
// per-tile totals held in `daux`, one column per tile) onto the per-tile
// exclusive scans in `sum`.  Launch mirrors sumKernel:
// grid = (numTiles, numRows), block = SCAN_WIDTH/2 threads, two columns each.
__global__ void combineSumKernel(intMatrix sum, intMatrix daux, int n){
  int tid = threadIdx.x;
  int row = blockIdx.y;
  int lo  = blockIdx.x * SCAN_WIDTH + 2*tid;  // first column this thread owns
  int hi  = lo + 1;                           // second column this thread owns

  if( lo < n )
    sum.mat[IDX( row, lo, sum.ld )] += daux.mat[IDX( row, blockIdx.x, daux.ld )];
  if( hi < n )
    sum.mat[IDX( row, hi, sum.ld )] += daux.mat[IDX( row, blockIdx.x, daux.ld )];
}
73
74
// One thread per row: counts[r] = number of nonzero entries in row r of `ir`.
// `sums` holds the row-wise exclusive scan of `ir`, so the count is the last
// scan value, plus one when the final element of the row is itself set.
// Launch: 1-D grid of BLOCK_SIZE-thread blocks; rows >= ir.r are ignored.
__global__ void getCountsKernel(int *counts, charMatrix ir, intMatrix sums){
  int row = blockIdx.x*BLOCK_SIZE + threadIdx.x;
  if( row < ir.r ){
    int total = sums.mat[IDX( row, sums.c-1, sums.ld )];
    if( ir.mat[IDX( row, ir.c-1, ir.ld )] )
      total++;
    counts[row] = total;
  }
}
80
81
// Scatter pass: for every set entry of `ir`, writes its column index into
// `map` at the position given by the exclusive scan `sums` (i.e. compacts
// the set columns of each row).  `offSet` shifts the destination row in map.
// Launch mirrors the scan: grid = (numTiles, numRows), block = SCAN_WIDTH/2,
// each thread handling two adjacent columns.
__global__ void buildMapKernel(intMatrix map, charMatrix ir, intMatrix sums, int offSet){
  int tid  = threadIdx.x;
  int row  = blockIdx.y;
  int col0 = blockIdx.x * SCAN_WIDTH + 2*tid;  // first column for this thread
  int col1 = col0 + 1;                         // second column for this thread

  if( col0 < ir.c && ir.mat[IDX( row, col0, ir.ld )] )
    map.mat[IDX( row+offSet, sums.mat[IDX( row, col0, sums.ld )], map.ld)] = col0;
  if( col1 < ir.c && ir.mat[IDX( row, col1, ir.ld )] )
    map.mat[IDX( row+offSet, sums.mat[IDX( row, col1, sums.ld )], map.ld)] = col1;
}
92
93
// Host wrapper for getCountsKernel: fills counts[r] for every row of `ir`.
// Fix: the grid size now rounds UP (ceil-div) instead of truncating, so all
// rows are covered even when ir.pr is not an exact multiple of BLOCK_SIZE.
// (The kernel bounds-checks against ir.r, so extra threads are harmless.)
void getCountsWrap(int *counts, charMatrix ir, intMatrix sums){
  dim3 block(BLOCK_SIZE,1);
  dim3 grid( (ir.pr+BLOCK_SIZE-1)/BLOCK_SIZE, 1 );
  getCountsKernel<<<grid,block>>>(counts, ir, sums);
}
100
101
// Host wrapper for buildMapKernel: compacts the set columns of every row of
// `ir` into `map`, using the precomputed exclusive scan in `sums`.
void buildMapWrap(intMatrix map, charMatrix ir, intMatrix sums, int offSet){
  int tiles = (ir.c+SCAN_WIDTH-1)/SCAN_WIDTH;  // ceil(ir.c / SCAN_WIDTH)
  dim3 threads( SCAN_WIDTH/2, 1 );             // two columns per thread
  dim3 blocks( tiles, ir.r );                  // one tile per block.x, one row per block.y
  buildMapKernel<<<blocks,threads>>>(map, ir, sums, offSet);
}
108
109
// Host driver for a full-row exclusive prefix sum of the 0/1 matrix `in`
// into `sum`.  Three phases:
//   1) sumKernel      - per-tile scans; per-tile totals go to dAux
//   2) sumKernelI     - single-block scan of the tile totals (in place;
//                       safe because the kernel stages through shared memory)
//   3) combineSumKernel - adds each tile's offset back onto its scan
// Limitation: the second-level scan runs in ONE block of SCAN_WIDTH elements,
// so at most SCAN_WIDTH tiles (n <= SCAN_WIDTH^2) are supported.
// Fixes: cudaMalloc results are now checked, and the deprecated
// cudaThreadSynchronize() is replaced by the equivalent cudaDeviceSynchronize().
void sumWrap(charMatrix in, intMatrix sum){
  int n = in.c;
  int numScans = (n+SCAN_WIDTH-1)/SCAN_WIDTH;  // number of tiles per row

  if(numScans > SCAN_WIDTH){
    fprintf(stderr,"scan is too large.. exiting \n");
    exit(1);
  }

  intMatrix dAux;    // per-tile totals, then (after phase 2) their exclusive scan
  dAux.r=dAux.pr=in.r; dAux.c=dAux.pc=dAux.ld=numScans;
  intMatrix dDummy;  // throwaway aux output for the second-level scan
  dDummy.r=dDummy.pr=in.r; dDummy.c=dDummy.pc=dDummy.ld=1;

  cudaError_t err;
  err = cudaMalloc( (void**)&dAux.mat, dAux.pr*dAux.pc*sizeof(*dAux.mat) );
  if( err != cudaSuccess ){
    fprintf(stderr,"cudaMalloc failed in sumWrap: %s \n", cudaGetErrorString(err));
    exit(1);
  }
  err = cudaMalloc( (void**)&dDummy.mat, dDummy.pr*dDummy.pc*sizeof(*dDummy.mat) );
  if( err != cudaSuccess ){
    fprintf(stderr,"cudaMalloc failed in sumWrap: %s \n", cudaGetErrorString(err));
    cudaFree(dAux.mat);
    exit(1);
  }

  dim3 block( SCAN_WIDTH/2, 1 );  // two elements per thread
  dim3 grid( numScans, in.r );

  sumKernel<<<grid,block>>>(in, sum, dAux, n);
  cudaDeviceSynchronize();

  grid.x=1;  // phase 2: one block scans all tile totals
  sumKernelI<<<grid,block>>>(dAux, dAux, dDummy, numScans);
  cudaDeviceSynchronize();

  grid.x=numScans;
  combineSumKernel<<<grid,block>>>(sum,dAux,n);
  cudaDeviceSynchronize();

  cudaFree(dAux.mat);
  cudaFree(dDummy.mat);
}
145
146
147 //This is the same as sumKernel, but takes an int matrix as input.
//This is the same as sumKernel, but takes an int matrix as input.
// Work-efficient (Blelloch) exclusive prefix sum over one SCAN_WIDTH-wide
// tile of each row of `in`; per-element results go to `sum`, tile totals to
// `sumaux`.  Expected launch: grid = (numTiles, numRows), block = SCAN_WIDTH/2
// threads, each owning two adjacent elements.  Safe to call with in == sum
// (in-place) because the scan is staged entirely through shared memory.
__global__ void sumKernelI(intMatrix in, intMatrix sum, intMatrix sumaux, int n){
  int id = threadIdx.x;
  int bo = blockIdx.x*SCAN_WIDTH; //block offset
  int r = blockIdx.y;
  int d, t;

  const int l=SCAN_WIDTH; //length

  int off=1;

  __shared__ int ssum[l];

  // Stage this block's tile into shared memory; columns past n contribute 0
  // so partial tiles scan correctly.
  ssum[2*id] = (bo+2*id < n) ? in.mat[IDX( r, bo+2*id, in.ld )] : 0;
  ssum[2*id+1] = (bo+2*id+1 < n) ? in.mat[IDX( r, bo+2*id+1, in.ld)] : 0;

  //up-sweep: build a binary reduction tree in place; after the loop
  //ssum[l-1] holds the tile total and off == l.
  for( d=l>>1; d > 0; d>>=1 ){
    __syncthreads();

    if( id < d ){
      ssum[ off*(2*id+2)-1 ] += ssum[ off*(2*id+1)-1 ];
    }
    off *= 2;
  }

  __syncthreads();

  // Record the tile total, then clear the root so the down-sweep produces
  // an EXCLUSIVE scan.
  if ( id == 0 ){
    sumaux.mat[IDX( r, blockIdx.x, sumaux.ld )] = ssum[ l-1 ];
    ssum[ l-1 ] = 0;
  }

  //down-sweep: walk the tree back down, swapping/accumulating partial sums
  //so each slot ends with the sum of everything strictly to its left.
  for ( d=1; d<l; d*=2 ){
    off >>= 1;
    __syncthreads();

    if( id < d ){
      t = ssum[ off*(2*id+1)-1 ];
      ssum[ off*(2*id+1)-1 ] = ssum[ off*(2*id+2)-1 ];
      ssum[ off*(2*id+2)-1 ] += t;
    }

  }

  __syncthreads();

  // Write the scanned tile back to global memory, guarding the row tail.
  if( bo+2*id < n )
    sum.mat[IDX( r, bo+2*id, sum.ld )] = ssum[2*id];
  if( bo+2*id+1 < n )
    sum.mat[IDX( r, bo+2*id+1, sum.ld )] = ssum[2*id+1];

}
202
203 #endif