9a24597d244a09f11bde08cae2a2c1f0a6b02eef
[qpalma.git] / tools / dataset_scripts / dataStatistic.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5
6 #
7 #
8 #
9
10 class Data:
11 pass
12
13
14
15
16 class DataStatistic:
17
18 def __init__(self,reads_fn,map_fn,map2_fn,heuristic_fn,heuristic_un_fn):
19 self.reads_fn = reads_fn
20 self.map_fn = map_fn
21 self.map2_fn = map2_fn
22 self.heuristic_fn = heuristic_fn
23 self.heuristic_un_fn = heuristic_un_fn
24
25 #self.all_read_ids_set = set([])
26 #self.spliced_ids_set = set([])
27 #self.unspliced_ids_set = set([])
28 #self.map_ids_set = set([])
29 #self.map_2nd_ids_set = set([])
30
31
32 def calcStat(self,filename,check_strand):
33 spliced_set = set([])
34 unspliced_set = set([])
35 all_set = set([])
36
37 for line in open(filename):
38 sl = line.strip().split()
39 id = int(sl[0])
40 chromo = int(sl[1])
41 strand = sl[3]
42
43 if check_strand and ((not chromo in range(1,6)) or strand != 'D'):
44 continue
45
46 if id < 1000000300000:
47 spliced_set.add(id)
48 else:
49 unspliced_set.add(id)
50
51 all_set.add(id)
52
53 return spliced_set,unspliced_set,all_set
54
55
56 def calculate(self):
57 all_spliced_set,all_unspliced_set,all_set = self.calcStat(self.reads_fn,False)
58 res = (len(all_spliced_set),len(all_unspliced_set),len(all_set),'total reads')
59 self.printResult(res)
60
61 check_switch = True
62 map_spliced_set,map_unspliced_set,map_set =\
63 self.calcStat(self.map_fn,check_switch)
64 res = (len(map_spliced_set),len(map_unspliced_set),len(map_set),'map reads')
65 self.printResult(res)
66
67 map2_spliced_set,map2_unspliced_set,map2_set =\
68 self.calcStat(self.map2_fn,check_switch)
69 res = (len(map2_spliced_set),len(map2_unspliced_set),len(map2_set),'map2 reads')
70 self.printResult(res)
71
72 heuristic_spliced_set,heuristic_unspliced_set,heuristic_set =\
73 self.calcStat(self.heuristic_fn,check_switch)
74 res = (len(heuristic_spliced_set),len(heuristic_unspliced_set),len(heuristic_set),'heuristic reads')
75 self.printResult(res)
76
77 heuristic_un_spliced_set,heuristic_un_unspliced_set,heuristic_un_set =\
78 self.calcStat(self.heuristic_un_fn,check_switch)
79 res = (len(heuristic_un_spliced_set),len(heuristic_un_unspliced_set),len(heuristic_un_set),'heuristic_un reads')
80 self.printResult(res)
81
82 all_found_set = map_set | map2_set
83 print 'found %d reads' % len(all_found_set)
84
85 not_found_set = all_set - all_found_set
86 print 'could not find %d reads' % len(not_found_set)
87
88 all_found_spliced_set = map_spliced_set | map2_spliced_set
89 print 'found %d spliced reads' % len(all_found_spliced_set)
90
91 not_found_spliced_set = all_spliced_set - all_found_spliced_set
92 print 'could not find %d spliced reads' % len(not_found_spliced_set)
93
94
95 all_found_unspliced_set = map_unspliced_set | map2_unspliced_set
96 not_found_unspliced_set = all_unspliced_set - all_found_unspliced_set
97 print 'could not find %d unspliced reads' % len(not_found_unspliced_set)
98 for i in range(10):
99 print not_found_unspliced_set.pop()
100
101
102 def printResult(self,res):
103 mes = '%s:\t%d\n' % (res[3],res[2])
104 mes += 'spliced reads:\t%d (%f)\n' % (res[0],(1.0*res[0])/res[2])
105 mes += 'unspliced reads:\t%d (%f)\n' % (res[1],(1.0*res[1])/res[2])
106 print mes
107
108
109 if __name__ == '__main__':
110
111 ds = DataStatistic(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5])
112 ds.calculate()