// [RBC.git] / sKernel.cu — commit: "Cleaned up, debugged. Ready for 1st release"
/* This file is part of the Random Ball Cover (RBC) library.
 * (C) Copyright 2010, Lawrence Cayton [lcayton@tuebingen.mpg.de]
 */
4
5 #ifndef SKERNEL_CU
6 #define SKERNEL_CU
7
8 #include<stdio.h>
9 #include<math.h>
10 #include "sKernel.h"
11 #include "defs.h"
12 #include "utils.h"
13
// Per-row, per-tile exclusive prefix sum (Blelloch scan) of a 0/1 char matrix.
// Each block scans one SCAN_WIDTH-wide tile of one row of `in` in shared
// memory, writes the exclusive scan into `sum`, and emits the tile's total
// into sumaux[row][tile] so a later pass can stitch tiles together.
// Expected launch: grid = (#tiles, #rows), blockDim.x == SCAN_WIDTH/2
// (each thread owns two adjacent elements).  `n` = valid columns; columns
// at or past n contribute 0.
__global__ void sumKernel(charMatrix in, intMatrix sum, intMatrix sumaux, unint n){
  unint tid     = threadIdx.x;
  unint tileOff = blockIdx.x*SCAN_WIDTH; // first column of this tile
  unint row     = blockIdx.y;
  unint depth, tmp;

  const unint len = SCAN_WIDTH; // elements scanned per tile

  unint stride = 1;

  __shared__ unint scratch[len];

  // Load two elements per thread, zero-padding past the end of the row.
  scratch[2*tid]   = ( tileOff+2*tid   < n ) ? in.mat[IDX( row, tileOff+2*tid,   in.ld )] : 0;
  scratch[2*tid+1] = ( tileOff+2*tid+1 < n ) ? in.mat[IDX( row, tileOff+2*tid+1, in.ld )] : 0;

  // Up-sweep (reduce): build a binary tree of partial sums in place.
  for( depth = len>>1; depth > 0; depth >>= 1 ){
    __syncthreads();
    if( tid < depth )
      scratch[ stride*(2*tid+2)-1 ] += scratch[ stride*(2*tid+1)-1 ];
    stride *= 2;
  }

  __syncthreads();

  // Save the tile total for the combine step, then zero the root so the
  // down-sweep yields an *exclusive* scan.
  if( tid == 0 ){
    sumaux.mat[IDX( row, blockIdx.x, sumaux.ld )] = scratch[ len-1 ];
    scratch[ len-1 ] = 0;
  }

  // Down-sweep: push partial sums back down the tree.
  for( depth = 1; depth < len; depth *= 2 ){
    stride >>= 1;
    __syncthreads();
    if( tid < depth ){
      tmp = scratch[ stride*(2*tid+1)-1 ];
      scratch[ stride*(2*tid+1)-1 ]  = scratch[ stride*(2*tid+2)-1 ];
      scratch[ stride*(2*tid+2)-1 ] += tmp;
    }
  }

  __syncthreads();

  // Write back only the in-bounds results.
  if( tileOff+2*tid < n )
    sum.mat[IDX( row, tileOff+2*tid, sum.ld )] = scratch[2*tid];
  if( tileOff+2*tid+1 < n )
    sum.mat[IDX( row, tileOff+2*tid+1, sum.ld )] = scratch[2*tid+1];
}
65
66
// Integer-input variant of sumKernel: per-row, per-tile exclusive prefix
// sum (Blelloch scan) of an int matrix, with each tile's total emitted to
// sumaux[row][tile] for a later combine pass.
// Expected launch: grid = (#tiles, #rows), blockDim.x == SCAN_WIDTH/2;
// columns at or past n contribute 0.
__global__ void sumKernelI(intMatrix in, intMatrix sum, intMatrix sumaux, unint n){
  unint lane = threadIdx.x;
  unint base = blockIdx.x*SCAN_WIDTH; // first column of this tile
  unint row  = blockIdx.y;
  unint lvl, swap;

  const unint width = SCAN_WIDTH; // elements scanned per tile

  unint step = 1;

  __shared__ unint buf[width];

  // Each thread stages its two elements, zero-padding beyond column n.
  buf[2*lane]   = ( base+2*lane   < n ) ? in.mat[IDX( row, base+2*lane,   in.ld )] : 0;
  buf[2*lane+1] = ( base+2*lane+1 < n ) ? in.mat[IDX( row, base+2*lane+1, in.ld )] : 0;

  // Up-sweep (reduce) phase.
  for( lvl = width>>1; lvl > 0; lvl >>= 1 ){
    __syncthreads();
    if( lane < lvl )
      buf[ step*(2*lane+2)-1 ] += buf[ step*(2*lane+1)-1 ];
    step *= 2;
  }

  __syncthreads();

  // Record the tile total, then clear the root so the down-sweep
  // produces an exclusive scan.
  if( lane == 0 ){
    sumaux.mat[IDX( row, blockIdx.x, sumaux.ld )] = buf[ width-1 ];
    buf[ width-1 ] = 0;
  }

  // Down-sweep phase.
  for( lvl = 1; lvl < width; lvl *= 2 ){
    step >>= 1;
    __syncthreads();
    if( lane < lvl ){
      swap = buf[ step*(2*lane+1)-1 ];
      buf[ step*(2*lane+1)-1 ]  = buf[ step*(2*lane+2)-1 ];
      buf[ step*(2*lane+2)-1 ] += swap;
    }
  }

  __syncthreads();

  // Store the in-bounds scan results.
  if( base+2*lane < n )
    sum.mat[IDX( row, base+2*lane, sum.ld )] = buf[2*lane];

  if( base+2*lane+1 < n )
    sum.mat[IDX( row, base+2*lane+1, sum.ld )] = buf[2*lane+1];
}
120
121
122
// Completes the multi-tile scan: adds the (already scanned) per-tile offset
// stored in daux[row][tile] to every entry of that tile in `sum`.
// `numDone` shifts the row index so large jobs can be processed in chunks.
// Launch layout mirrors sumKernel: grid = (#tiles, #rows-in-chunk), each
// thread fixes up two adjacent columns.
__global__ void combineSumKernel(intMatrix sum, unint numDone, intMatrix daux, unint n){
  unint id = threadIdx.x;
  unint bo = blockIdx.x * SCAN_WIDTH;
  unint r = blockIdx.y+numDone;

  // The tile offset is identical for both columns this thread touches:
  // load it from global memory once instead of twice.
  unint tileOffset = daux.mat[IDX( r, blockIdx.x, daux.ld )];

  if(bo+2*id < n)
    sum.mat[IDX( r, bo+2*id, sum.ld )] += tileOffset;
  if(bo+2*id+1 < n)
    sum.mat[IDX( r, bo+2*id+1, sum.ld )] += tileOffset;
}
134
135
// Writes counts[r] = number of set entries in row r of `ir`: the exclusive
// scan total in the last column of `sums`, plus one when the final entry of
// `ir` itself is set (an exclusive scan omits the last element's value).
// `numDone` shifts the row index for chunked invocation; one thread per row.
__global__ void getCountsKernel(unint *counts, unint numDone, charMatrix ir, intMatrix sums){
  unint r = blockIdx.x*BLOCK_SIZE + threadIdx.x + numDone;
  if ( r < ir.r ){
    // Load the scan total once; the original read it in both ternary arms.
    unint total = sums.mat[IDX( r, sums.c-1, sums.ld )];
    counts[r] = ir.mat[IDX( r, ir.c-1, ir.ld )] ? total+1 : total;
  }
}
142
143
// Stream compaction: for every set entry (row, c) of `ir`, writes the column
// index c into map[row+offSet] at the position given by the exclusive scan
// in `sums`, producing a dense per-row list of the set columns.
// Launch layout mirrors sumKernel: grid = (#tiles, #rows), each thread
// handles two adjacent columns.
__global__ void buildMapKernel(intMatrix map, charMatrix ir, intMatrix sums, unint offSet){
  unint tid  = threadIdx.x;
  unint base = blockIdx.x * SCAN_WIDTH;
  unint row  = blockIdx.y;
  unint c0   = base + 2*tid; // first column owned by this thread
  unint c1   = c0 + 1;       // second column owned by this thread

  if( c0 < ir.c && ir.mat[IDX( row, c0, ir.ld )] )
    map.mat[IDX( row+offSet, sums.mat[IDX( row, c0, sums.ld )], map.ld)] = c0;
  if( c1 < ir.c && ir.mat[IDX( row, c1, ir.ld )] )
    map.mat[IDX( row+offSet, sums.mat[IDX( row, c1, sums.ld )], map.ld)] = c1;
}
154
155
156 #endif