fixed bug in driver related to text input; fixed bugs related to size_t
[RBC.git] / sKernelWrap.cu
1 /* This file is part of the Random Ball Cover (RBC) library.
2 * (C) Copyright 2010, Lawrence Cayton [lcayton@tuebingen.mpg.de]
3 */
4 #ifndef SKERNELWRAP_CU
5 #define SKERNELWRAP_CU
6
7 #include "sKernel.h"
8 #include<cuda.h>
9 #include "defs.h"
10 #include "utilsGPU.h"
11 #include<stdio.h>
12
/* Host wrapper that launches getCountsKernel over the padded rows of ir.
 *
 * counts - passed through to the kernel unchanged (output buffer, presumably
 *          on the device -- confirm against getCountsKernel).
 * ir     - matrix whose padded row count (ir.pr) determines how much work is
 *          launched.
 * sums   - passed through to the kernel unchanged.
 *
 * The rows are processed in chunks of at most MAX_BS*BLOCK_SIZE so that
 * grid.x never exceeds MAX_BS.  The second kernel argument is the row offset
 * of the current chunk.
 *
 * NOTE(review): grid.x is computed with a truncating division, so a trailing
 * partial block (ir.pr not a multiple of BLOCK_SIZE) is not launched.  This
 * is presumably fine because ir.pr is padded to a multiple of BLOCK_SIZE --
 * confirm against the code that builds ir.
 */
void getCountsWrap(unint *counts, charMatrix ir, intMatrix sums){
  dim3 block(BLOCK_SIZE,1);
  dim3 grid;
  grid.y=1;
  unint todo, numDone;

  numDone = 0;
  while(numDone < ir.pr){
    todo = MIN( ir.pr - numDone, MAX_BS*BLOCK_SIZE );
    grid.x = todo/BLOCK_SIZE;

    // A zero-sized grid is an invalid launch configuration; skip it rather
    // than leave a pending CUDA error behind for some unrelated later call.
    if( grid.x > 0 ){
      getCountsKernel<<<grid,block>>>(counts, numDone, ir, sums);
      // Kernel launches do not return a status; fetch it explicitly.
      checkErr( cudaGetLastError() );
    }
    numDone += todo;
  }
}
27
28
/* Host wrapper that launches buildMapKernel over all rows of ir.
 *
 * map    - passed through to the kernel unchanged (output, presumably on the
 *          device -- confirm against buildMapKernel).
 * ir     - matrix whose column count (ir.c) fixes grid.x and whose row count
 *          (ir.r) fixes the total amount of work.
 * sums   - passed through to the kernel unchanged.
 * offSet - base offset; the kernel receives offSet plus the number of rows
 *          already processed by earlier chunks.
 *
 * grid.x is the number of SCAN_WIDTH-wide segments needed to cover ir.c
 * columns.  Rows are processed in chunks of at most MAX_BS so that grid.y
 * never exceeds MAX_BS.
 */
void buildMapWrap(intMatrix map, charMatrix ir, intMatrix sums, unint offSet){
  unint numScans = (ir.c+SCAN_WIDTH-1)/SCAN_WIDTH;
  dim3 block( SCAN_WIDTH/2, 1 );
  dim3 grid;
  unint todo, numDone;

  // With no columns grid.x would be 0, which is an invalid launch
  // configuration; there is nothing to do in that case.
  if( numScans == 0 )
    return;

  grid.x = numScans;
  numDone = 0;
  while( numDone < ir.r ){
    todo = MIN( ir.r-numDone, MAX_BS );
    grid.y = todo;
    buildMapKernel<<<grid,block>>>(map, ir, sums, offSet+numDone);
    // Kernel launches do not return a status; fetch it explicitly.
    checkErr( cudaGetLastError() );
    numDone += todo;
  }
}
44
45
/* Reports any pending launch error, then waits for the device and reports any
 * error returned by the synchronization, both through checkErr.
 * cudaThreadSynchronize is kept for compatibility with the toolkit this file
 * targets; newer toolkits deprecate it in favour of cudaDeviceSynchronize. */
static void sumWrapSync(){
  checkErr( cudaGetLastError() );
  checkErr( cudaThreadSynchronize() );
}


/* Host driver for the hierarchical row-wise sum built from sumKernel,
 * sumKernelI and combineSumKernel.
 *
 * in  - input matrix; in.c columns per row, in.r rows.
 * sum - output matrix handed to sumKernel and to the final combineSumKernel.
 *
 * Each row is split into SCAN_WIDTH-wide segments.  dAux[k] is a temporary
 * device matrix with one column per segment at level k (presumably holding
 * per-segment totals -- confirm against the kernels).  For every chunk of at
 * most MAX_BS rows the function
 *   1. runs sumKernel on the input, filling dAux[0];
 *   2. runs sumKernelI on dAux[0..depth-1] in turn, each filling the next
 *      level;
 *   3. runs combineSumKernel from the top level back down, finishing with
 *      the output matrix.
 *
 * Errors from CUDA calls are reported through checkErr; a host allocation
 * failure prints a message and exits.
 */
void sumWrap(charMatrix in, intMatrix sum){
  unint i;
  unint todo, numDone, temp;
  unint n = in.c;
  unint numScans;
  unint depth = 0;

  // Nothing to sum; this also avoids launching kernels with a zero-sized grid.
  if( n == 0 || in.r == 0 )
    return;

  // depth = number of auxiliary levels above dAux[0], i.e.
  // ceil(log_SCAN_WIDTH(n)) - 1 clamped at 0.  Computed with integers so that
  // n == 1 and exact powers of SCAN_WIDTH are handled correctly (the
  // floating-point formula yields -1 or rounds up in those cases).
  for( temp=(n+SCAN_WIDTH-1)/SCAN_WIDTH; temp>1; temp=(temp+SCAN_WIDTH-1)/SCAN_WIDTH )
    depth++;

  unint *width = (unint*)calloc( depth+1, sizeof(*width) );
  intMatrix *dAux = (intMatrix*)calloc( depth+1, sizeof(*dAux) );
  if( !width || !dAux ){
    fprintf(stderr, "sumWrap: host memory allocation failed\n");
    free(width);
    free(dAux);
    exit(1);
  }

  // Level i has one column per SCAN_WIDTH-wide segment of level i-1
  // (level -1 being the input itself).
  for( i=0, temp=n; i<=depth; i++ ){
    temp = (temp+SCAN_WIDTH-1)/SCAN_WIDTH;
    dAux[i].r=dAux[i].pr=in.r; dAux[i].c=dAux[i].pc=dAux[i].ld=temp;
    // Widen before multiplying so the byte count cannot wrap in 32 bits.
    checkErr( cudaMalloc( (void**)&dAux[i].mat, (size_t)dAux[i].pr*dAux[i].pc*sizeof(*dAux[i].mat) ) );
  }

  dim3 block( SCAN_WIDTH/2, 1 );
  dim3 grid;

  numDone=0;
  while( numDone < in.r ){
    todo = MIN( in.r - numDone, MAX_BS );
    numScans = (n+SCAN_WIDTH-1)/SCAN_WIDTH;
    dAux[0].r=dAux[0].pr=todo;
    grid.x = numScans;
    grid.y = todo;
    sumKernel<<<grid,block>>>(in, sum, dAux[0], n);
    sumWrapSync();

    width[0] = numScans; // Necessary because following loop might not be entered
    for( i=0; i<depth; i++ ){
      width[i] = numScans;
      numScans = (numScans+SCAN_WIDTH-1)/SCAN_WIDTH;
      dAux[i+1].r=dAux[i+1].pr=todo;

      grid.x = numScans;
      sumKernelI<<<grid,block>>>(dAux[i], dAux[i], dAux[i+1], width[i]);
      sumWrapSync();
    }

    // Walk levels depth-1 .. 1 downwards.  The index runs one above the level
    // so that the unsigned counter never has to go below zero.
    for( i=depth; i>1; i-- ){
      unint level = i-1;
      grid.x = width[level];
      combineSumKernel<<<grid,block>>>(dAux[level-1], numDone, dAux[level], width[level-1]);
      sumWrapSync();
    }

    grid.x = width[0];
    combineSumKernel<<<grid,block>>>(sum, numDone, dAux[0], n);
    sumWrapSync();

    numDone += todo;
  }

  for( i=0; i<=depth; i++ )
    checkErr( cudaFree(dAux[i].mat) );
  free(dAux);
  free(width);
}
105
106
107 #endif