+ refactored code further
[qpalma.git] / scripts / Evaluation.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import cPickle
5 import sys
6 import pydb
7 import pdb
8 import os
9 import os.path
10 import math
11
12
13 def evaluatePositions(eBegin,eEnd):
14 eBegin_pos = [elem for elem in eBegin if elem == 0]
15 eBegin_neg = [elem for elem in eBegin if elem != 0]
16 eEnd_pos = [elem for elem in eEnd if elem == 0]
17 eEnd_neg = [elem for elem in eEnd if elem != 0]
18
19 mean_eBegin_neg = 0
20 for idx in range(len(eBegin_neg)):
21 mean_eBegin_neg += eBegin_neg[idx]
22
23 try:
24 mean_eBegin_neg /= 1.0*len(eBegin_neg)
25 except:
26 mean_eBegin_neg = -1
27
28 mean_eEnd_neg = 0
29 for idx in range(len(eEnd_neg)):
30 mean_eEnd_neg += eEnd_neg[idx]
31
32 try:
33 mean_eEnd_neg /= 1.0*len(eEnd_neg)
34 except:
35 mean_eEnd_neg = -1
36
37 return eBegin_pos,eBegin_neg,eEnd_pos,eEnd_neg,mean_eBegin_neg,mean_eEnd_neg
38
39
40 def perform_prediction(current_dir,run_name):
41 cmd = 'echo /fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/doPrediction.sh %s | qsub -l h_vmem=12.0G -cwd -j y -N \"%s.log\"'%(current_dir,run_name)
42 #cmd = './doPrediction.sh %s 1>%s.out 2>%s.err' %(current_dir,run_name,run_name)
43 #print cmd
44 os.system(cmd)
45
46 def forall_experiments(current_func,tl_dir):
47 """
48 Given the toplevel directoy this function calls for each subdir the
49 function given as first argument
50 """
51
52 dir_entries = os.listdir(tl_dir)
53 dir_entries = [os.path.join(tl_dir,de) for de in dir_entries]
54 run_dirs = [de for de in dir_entries if os.path.isdir(de)]
55
56 for current_dir in run_dirs:
57 run_name = current_dir.split('/')[-1]
58 current_func(current_dir,run_name)
59
60
61 def collect_prediction(current_dir,run_name):
62 """
63 Given the toplevel directoy this function calls for each subdir the
64 """
65
66 train_suffix = '_allPredictions_TRAIN'
67 test_suffix = '_allPredictions_TEST'
68 jp = os.path.join
69
70 filename = jp(current_dir,run_name)+train_suffix
71 print 'Prediction on: %s' % filename
72 prediction_on(filename)
73
74 filename = jp(current_dir,run_name)+test_suffix
75 print 'Prediction on: %s' % filename
76 prediction_on(filename)
77
78
79 def prediction_on(filename):
80 allPredictions = cPickle.load(open(filename))
81
82 exon1Begin = []
83 exon1End = []
84 exon2Begin = []
85 exon2End = []
86 allWrongExons = []
87 allDoubleScores = []
88
89 all_pos_correct_ctr = 0
90
91 for current_example_pred in allPredictions:
92 for elem_nr,current_pred in enumerate(current_example_pred[:1]):
93
94 #gt_score = gt_example[]
95 #score = gt_example['DPScores'].flatten().tolist()[0][0]
96
97 #all_pos_correct = evaluate_example(current_pred)
98 all_pos_correct = evaluate_unmapped_example(current_pred)
99
100 if all_pos_correct:
101 all_pos_correct_ctr += 1
102
103 print 'Total num. of examples: %d' % len(allPredictions)
104 print 'Number of total correct examples: %d' % all_pos_correct_ctr
105
106 #print 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
107 #print 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
108 #print 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
109
110 def evaluate_unmapped_example(current_prediction):
111
112 predExons = current_prediction['predExons']
113 trueExons = current_prediction['trueExons']
114
115 return compare_exons(predExons,trueExons)
116
117
118 def evaluate_example(current_prediction):
119
120 label = False
121
122 try:
123 label = current_prediction['label']
124 except:
125 pass
126
127 #predExons = current_prediction['predExons']
128 #trueExons = current_prediction['trueExons']
129 #pdb.set_trace()
130
131 if label:
132
133 start_pos = current_prediction['start_pos']
134 alternative_start_pos = current_prediction['alternative_start_pos']
135
136 predExons = current_prediction['predExons']
137 trueExons = current_prediction['trueExons']
138
139 #pdb.set_trace()
140
141 if start_pos <= alternative_start_pos:
142 start_offset = alternative_start_pos-start_pos
143 predExons += start_offset
144
145 return compare_exons(predExons,trueExons)
146
147 if alternative_start_pos <= start_pos:
148 start_offset = start_pos-alternative_start_pos
149 predExons += start_offset
150
151 return compare_exons(predExons,trueExons)
152
153 else:
154 pred_score = current_prediction['DPScores'].flatten().tolist()[0][0]
155
156 return False
157
158
159 def compare_exons(predExons,trueExons):
160 e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
161
162 if len(predExons) == 4:
163 e1_begin,e1_end = predExons[0],predExons[1]
164 e2_begin,e2_end = predExons[2],predExons[3]
165 else:
166 return False
167
168 e1_b_off = int(math.fabs(e1_begin - trueExons[0,0]))
169 e1_e_off = int(math.fabs(e1_end - trueExons[0,1]))
170
171 e2_b_off = int(math.fabs(e2_begin - trueExons[1,0]))
172 e2_e_off = int(math.fabs(e2_end - trueExons[1,1]))
173
174 if e1_b_off == 0 and e1_e_off == 0 and e2_b_off == 0\
175 and e2_e_off == 0:
176 return True
177
178 return False
179
180
181 def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign):
182 newExons = []
183 oldElem = -1
184 SpliceAlign = SpliceAlign.flatten().tolist()[0]
185 SpliceAlign.append(-1)
186 for pos,elem in enumerate(SpliceAlign):
187 if pos == 0:
188 oldElem = -1
189 else:
190 oldElem = SpliceAlign[pos-1]
191
192 if oldElem != 0 and elem == 0: # start of exon
193 newExons.append(pos)
194
195 if oldElem == 0 and elem != 0: # end of exon
196 newExons.append(pos)
197
198 e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
199
200 if len(newExons) == 4:
201 e1_begin,e1_end = newExons[0],newExons[1]
202 e2_begin,e2_end = newExons[2],newExons[3]
203 else:
204 return None,None,None,None,newExons
205
206 e1_b_off = int(math.fabs(e1_begin - exons[0,0]))
207 e1_e_off = int(math.fabs(e1_end - exons[0,1]))
208
209 e2_b_off = int(math.fabs(e2_begin - exons[1,0]))
210 e2_e_off = int(math.fabs(e2_end - exons[1,1]))
211
212 return e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons
213
214
215 if __name__ == '__main__':
216 dir = sys.argv[1]
217 assert os.path.exists(dir), 'Error directory does not exist!'
218
219 #forall_experiments(perform_prediction,dir)
220 forall_experiments(collect_prediction,dir)
221
222 """
223 if elem_nr > 0:
224 #print 'start positions'
225 #print current_pred['start_pos'], current_pred['alternative_start_pos']
226
227 if current_pred['label'] == False or (current_pred['label'] == True
228 and len(current_pred['predExons']) != 4):
229 if
230 current_example_pred[0]['DPScores'].flatten().tolist()[0][0]:
231 print current_pred['trueExons'][0,1]-current_pred['trueExons'][0,0],\
232 current_pred['trueExons'][1,1] - current_pred['trueExons'][1,0],\
233 current_pred['predExons']
234 print current_pred['DPScores'].flatten().tolist()[0][0],\
235 current_example_pred[0]['DPScores'].flatten().tolist()[0][0]
236 ctr += 1
237 print ctr
238
239 if e1_b_off != None:
240 exon1Begin.append(e1_b_off)
241 exon1End.append(e1_e_off)
242 exon2Begin.append(e2_b_off)
243 exon2End.append(e2_e_off)
244 else:
245 pass
246 #allWrongExons.append((newExons,exons))
247
248 if ambigous_match == True:
249 current_score = current_pred['DPScores'][0]
250 example_scores.append(current_score)
251
252 e1Begin_pos,e1Begin_neg,e1End_pos,e1End_neg,mean_e1Begin_neg,mean_e1End_neg = evaluatePositions(exon1Begin,exon1End)
253 e2Begin_pos,e2Begin_neg,e2End_pos,e2End_neg,mean_e2Begin_neg,mean_e2End_neg = evaluatePositions(exon2Begin,exon2End)
254
255 allDoubleScores.append(example_scores)
256 """