Stable release of RBC
[RBC.git] / sKernel.cu
1 #ifndef SKERNEL_CU
2 #define SKERNEL_CU
3
4 #include<stdio.h>
5 #include "sKernel.h"
6 #include "defs.h"
7
// Per-tile exclusive prefix sum (Blelloch scan) over each row of a char
// matrix.  Each block scans one SCAN_WIDTH-wide tile of one row in shared
// memory and writes the tile's total into sumaux so a second-level scan
// can later combine the tiles.
// Expected launch: grid = (numTiles, numRows), block = SCAN_WIDTH/2 threads
// (each thread owns two adjacent elements).  n = number of valid columns.
__global__ void sumKernel(charMatrix in, intMatrix sum, intMatrix sumaux, int n){
  int id = threadIdx.x;
  int bo = blockIdx.x*SCAN_WIDTH; //block offset
  int r = blockIdx.y;
  int d, t;

  const int l=SCAN_WIDTH; //length

  int off=1;

  __shared__ int ssum[l];


  // Load two elements per thread; columns past n are padded with 0 so the
  // partial last tile scans correctly.
  ssum[2*id] = (bo+2*id < n) ? in.mat[IDX( r, bo+2*id, in.ld )] : 0;
  ssum[2*id+1] = (bo+2*id+1 < n) ? in.mat[IDX( r, bo+2*id+1, in.ld)] : 0;

  //up-sweep: build a reduction tree of partial sums in place
  for( d=l>>1; d > 0; d>>=1 ){
    __syncthreads();

    if( id < d ){
      ssum[ off*(2*id+2)-1 ] += ssum[ off*(2*id+1)-1 ];
    }
    off *= 2;
  }

  __syncthreads();

  // Thread 0 records the tile total for the second-level scan, then zeroes
  // the last slot so the down-sweep yields an EXCLUSIVE scan.
  if ( id == 0 ){
    sumaux.mat[IDX( r, blockIdx.x, sumaux.ld )] = ssum[ l-1 ];
    ssum[ l-1 ] = 0;
  }

  //down-sweep: walk back down the tree, swapping and adding partial sums
  for ( d=1; d<l; d*=2 ){
    off >>= 1;
    __syncthreads();

    if( id < d ){
      t = ssum[ off*(2*id+1)-1 ];
      ssum[ off*(2*id+1)-1 ] = ssum[ off*(2*id+2)-1 ];
      ssum[ off*(2*id+2)-1 ] += t;
    }

  }

  __syncthreads();

  // Write results back, guarding the (possibly partial) last tile.
  if( bo+2*id < n )
    sum.mat[IDX( r, bo+2*id, sum.ld )] = ssum[2*id];
  if( bo+2*id+1 < n )
    sum.mat[IDX( r, bo+2*id+1, sum.ld )] = ssum[2*id+1];

}
62
// Completes the two-level scan: adds each tile's scanned offset (from daux)
// onto that tile's per-element scan results in sum.
// Expected launch: grid = (numTiles, numRows), block = SCAN_WIDTH/2 threads,
// matching the layout used by sumKernel.
__global__ void combineSumKernel(intMatrix sum, intMatrix daux, int n){
  int row = blockIdx.y;
  int col = blockIdx.x*SCAN_WIDTH + 2*threadIdx.x;
  // The tile offset is the same for both elements; load it once.
  int tileOff = daux.mat[IDX( row, blockIdx.x, daux.ld )];

  if( col < n )
    sum.mat[IDX( row, col, sum.ld )] += tileOff;
  if( col+1 < n )
    sum.mat[IDX( row, col+1, sum.ld )] += tileOff;
}
74
75
// For each row, writes the total count of set entries in ir: the last value
// of the exclusive scan in sums, plus one if the row's final element is set.
// Expected launch: 1-D grid of BLOCK_SIZE-thread blocks covering ir's rows.
__global__ void getCountsKernel(int *counts, charMatrix ir, intMatrix sums){
  int r = blockIdx.x*BLOCK_SIZE + threadIdx.x;
  if ( r < ir.r ){
    int total = sums.mat[IDX( r, sums.c-1, sums.ld )];
    // Exclusive scan excludes the last element; add it back if set.
    if( ir.mat[IDX( r, ir.c-1, ir.ld )] )
      total++;
    counts[r] = total;
  }
}
81
82
// Scatter pass: for every set entry of ir, stores its column index into map
// at the slot given by the exclusive scan in sums (i.e. compacts the set
// columns of each row).  offSet shifts the destination row index, allowing
// the caller to process row chunks of a larger map.
// Expected launch: grid = (numTiles, numRows), block = SCAN_WIDTH/2 threads.
__global__ void buildMapKernel(intMatrix map, charMatrix ir, intMatrix sums, int offSet){
  int tid = threadIdx.x;
  int base = blockIdx.x * SCAN_WIDTH;
  int r = blockIdx.y;
  int i, col;

  // Each thread covers two adjacent columns, mirroring the scan layout.
  for( i=0; i<2; i++ ){
    col = base + 2*tid + i;
    if( col < ir.c && ir.mat[IDX( r, col, ir.ld )] )
      map.mat[IDX( r+offSet, sums.mat[IDX( r, col, sums.ld )], map.ld)] = col;
  }
}
93
94
// Host wrapper: launches getCountsKernel over every row of ir.
// Fix: use ceiling division for the grid size.  The original ir.pr/BLOCK_SIZE
// truncates, so any rows beyond the last full block would be skipped whenever
// ir.pr is not a multiple of BLOCK_SIZE.  The kernel bounds-checks r < ir.r,
// so the extra threads from rounding up are harmless.
void getCountsWrap(int *counts, charMatrix ir, intMatrix sums){
  dim3 block(BLOCK_SIZE,1);
  dim3 grid((ir.pr+BLOCK_SIZE-1)/BLOCK_SIZE,1);
  getCountsKernel<<<grid,block>>>(counts, ir, sums);

}
101
102
// Host wrapper: launches buildMapKernel with one block per SCAN_WIDTH-wide
// column tile and one grid row per matrix row.
void buildMapWrap(intMatrix map, charMatrix ir, intMatrix sums, int offSet){
  dim3 block( SCAN_WIDTH/2, 1 );
  // Ceiling division: enough tiles to cover all ir.c columns.
  dim3 grid( (ir.c+SCAN_WIDTH-1)/SCAN_WIDTH, ir.r );
  buildMapKernel<<<grid,block>>>(map, ir, sums, offSet);
}
109
110
// Host driver for a row-wise exclusive prefix sum of the char matrix in,
// written to sum.  Three phases: per-tile scans (sumKernel), a scan of the
// tile totals (sumKernelI, in place on dAux), and a combine pass adding each
// tile's offset back in (combineSumKernel).
// Requires n <= SCAN_WIDTH^2, since the tile totals must fit in one tile of
// the second-level scan.
// Fixes: cudaThreadSynchronize() (deprecated, removed in CUDA 12) replaced
// with cudaDeviceSynchronize(); cudaMalloc results are now checked.
void sumWrap(charMatrix in, intMatrix sum){
  int n = in.c;
  int numScans = (n+SCAN_WIDTH-1)/SCAN_WIDTH;

  if(numScans > SCAN_WIDTH){
    fprintf(stderr,"scan is too large.. exiting \n");
    exit(1);
  }

  intMatrix dAux;   // per-row tile totals, then their scan (in place)
  dAux.r=dAux.pr=in.r; dAux.c=dAux.pc=dAux.ld=numScans;
  intMatrix dDummy; // throwaway aux output of the second-level scan
  dDummy.r=dDummy.pr=in.r; dDummy.c=dDummy.pc=dDummy.ld=1;

  // An unchecked allocation failure here would surface only as a crash
  // inside the kernels, so fail loudly instead.
  if( cudaMalloc( (void**)&dAux.mat, dAux.pr*dAux.pc*sizeof(*dAux.mat) ) != cudaSuccess ||
      cudaMalloc( (void**)&dDummy.mat, dDummy.pr*dDummy.pc*sizeof(*dDummy.mat) ) != cudaSuccess ){
    fprintf(stderr,"sumWrap: device memory allocation failed.. exiting \n");
    exit(1);
  }

  dim3 block( SCAN_WIDTH/2, 1 );
  dim3 grid( numScans, in.r );

  sumKernel<<<grid,block>>>(in, sum, dAux, n);
  cudaDeviceSynchronize();
  grid.x=1;

  // Scan the tile totals so every tile learns its global offset.
  // numScans <= SCAN_WIDTH guarantees a single block suffices.
  sumKernelI<<<grid,block>>>(dAux, dAux, dDummy, numScans);
  cudaDeviceSynchronize();

  grid.x=numScans;
  combineSumKernel<<<grid,block>>>(sum,dAux,n);
  cudaDeviceSynchronize();

  cudaFree(dAux.mat);
  cudaFree(dDummy.mat);

}
146
147
// This is the same as sumKernel, but takes an int matrix as input.
// Per-tile exclusive prefix sum (Blelloch scan) over each row; each block
// scans one SCAN_WIDTH-wide tile in shared memory and writes the tile total
// to sumaux.  sumWrap calls this with in == sum (dAux, dAux), which is safe
// because the tile is staged into shared memory before any write-back.
// Expected launch: grid = (numTiles, numRows), block = SCAN_WIDTH/2 threads.
__global__ void sumKernelI(intMatrix in, intMatrix sum, intMatrix sumaux, int n){
  int id = threadIdx.x;
  int bo = blockIdx.x*SCAN_WIDTH; //block offset
  int r = blockIdx.y;
  int d, t;

  const int l=SCAN_WIDTH; //length

  int off=1;

  __shared__ int ssum[l];


  // Load two elements per thread; columns past n are padded with 0.
  ssum[2*id] = (bo+2*id < n) ? in.mat[IDX( r, bo+2*id, in.ld )] : 0;
  ssum[2*id+1] = (bo+2*id+1 < n) ? in.mat[IDX( r, bo+2*id+1, in.ld)] : 0;

  //up-sweep: build a reduction tree of partial sums in place
  for( d=l>>1; d > 0; d>>=1 ){
    __syncthreads();

    if( id < d ){
      ssum[ off*(2*id+2)-1 ] += ssum[ off*(2*id+1)-1 ];
    }
    off *= 2;
  }

  __syncthreads();

  // Thread 0 records the tile total, then zeroes the last slot so the
  // down-sweep yields an EXCLUSIVE scan.
  if ( id == 0 ){
    sumaux.mat[IDX( r, blockIdx.x, sumaux.ld )] = ssum[ l-1 ];
    ssum[ l-1 ] = 0;
  }

  //down-sweep: walk back down the tree, swapping and adding partial sums
  for ( d=1; d<l; d*=2 ){
    off >>= 1;
    __syncthreads();

    if( id < d ){
      t = ssum[ off*(2*id+1)-1 ];
      ssum[ off*(2*id+1)-1 ] = ssum[ off*(2*id+2)-1 ];
      ssum[ off*(2*id+2)-1 ] += t;
    }

  }

  __syncthreads();

  // Write results back, guarding the (possibly partial) last tile.
  if( bo+2*id < n )
    sum.mat[IDX( r, bo+2*id, sum.ld )] = ssum[2*id];
  if( bo+2*id+1 < n )
    sum.mat[IDX( r, bo+2*id+1, sum.ld )] = ssum[2*id+1];

}
203
204 #endif