+ minor changes in Evaluation an soap remapping wrapper
[qpalma.git] / scripts / Evaluation.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import cPickle
5 import sys
6 import pydb
7 import pdb
8 import os
9 import os.path
10 import math
11
12
def compare_scores_and_labels(scores,labels):
    """
    Check whether every correct prediction scores at least as high as every
    incorrect prediction for the same example.

    scores -- list of DP scores, one per prediction
    labels -- parallel list; True marks a correct prediction, False an
              incorrect one

    Returns False iff some incorrect prediction scores strictly higher than
    some correct prediction; True otherwise (also when there are no correct
    or no incorrect predictions at all).
    """
    # Partition the scores by label once instead of the former O(n^2)
    # pairwise comparison; "some False-labeled score > some True-labeled
    # score" is equivalent to max(incorrect) > min(correct).
    correct_scores   = [s for s,lab in zip(scores,labels) if lab == True]
    incorrect_scores = [s for s,lab in zip(scores,labels) if lab == False]

    if not correct_scores or not incorrect_scores:
        return True

    return max(incorrect_scores) <= min(correct_scores)
30
31
def evaluate_unmapped_example(current_prediction):
    """
    Evaluate the ground-truth alignment of one example: return True iff the
    predicted exon boundaries ('predExons') exactly match the true exon
    boundaries ('trueExons').
    """
    return compare_exons(current_prediction['predExons'],
                         current_prediction['trueExons'])
38
39
def evaluate_example(current_prediction):
    """
    Evaluate one vmatch-based alignment of a read.

    current_prediction -- dict with keys 'label', 'DPScores', and (for
                          positively labeled examples) 'predExons',
                          'trueExons', 'alternative_start_pos', 'start_pos'.
                          'DPScores' and 'trueExons' are 1xN matrix-like
                          objects (flatten().tolist() yields a nested list).

    Returns a triple (label, positions_match, dp_score).
    """
    label = current_prediction['label']

    # DPScores is a 1x1 matrix-like object; extract the scalar score
    pred_score = current_prediction['DPScores'].flatten().tolist()[0][0]

    # if the read was mapped by vmatch at an incorrect position we only have to
    # compare the score
    if label == False:
        return label,False,pred_score

    predExons = current_prediction['predExons']
    trueExons = current_prediction['trueExons']

    # shift the relative exon coordinates to absolute positions before comparing
    predPositions = [elem + current_prediction['alternative_start_pos'] for elem in predExons]
    truePositions = [elem + current_prediction['start_pos'] for elem in trueExons.flatten().tolist()[0]]

    pos_comparison = (predPositions == truePositions)

    return label,pos_comparison,pred_score
65
66
def compare_exons(predExons,trueExons):
    """
    Return True iff predExons describes exactly the two exons in trueExons.

    predExons -- flat sequence of exon boundaries; anything other than
                 exactly four entries counts as a wrong prediction
    trueExons -- 2x2 matrix-like object, row per exon, [begin, end]
    """
    # a correct prediction must consist of exactly two exons (four boundaries)
    if len(predExons) != 4:
        return False

    expected = (trueExons[0,0], trueExons[0,1], trueExons[1,0], trueExons[1,1])

    # keep the original int(fabs(...)) semantics: offsets smaller than 1
    # are truncated to 0 and therefore count as a match
    offsets = [int(math.fabs(pred - true)) for pred,true in zip(predExons,expected)]

    return offsets == [0, 0, 0, 0]
87
88 def prediction_on(filename):
89 allPredictions = cPickle.load(open(filename))
90
91 exon1Begin = []
92 exon1End = []
93 exon2Begin = []
94 exon2End = []
95 allWrongExons = []
96 allDoubleScores = []
97
98 gt_correct_ctr = 0
99 pos_correct_ctr = 0
100 pos_incorrect_ctr = 0
101 score_correct_ctr = 0
102 score_incorrect_ctr = 0
103
104 total_vmatch_instances_ctr = 0
105 true_vmatch_instances_ctr = 0
106
107 for current_example_pred in allPredictions:
108 gt_example = current_example_pred[0]
109 gt_score = gt_example['DPScores'].flatten().tolist()[0][0]
110 gt_correct = evaluate_unmapped_example(gt_example)
111
112 current_scores = []
113 current_labels = []
114 current_scores.append(gt_score)
115 current_labels.append(gt_correct)
116
117 if gt_correct:
118 gt_correct_ctr += 1
119
120 for elem_nr,current_pred in enumerate(current_example_pred[1:]):
121 current_label,comparison_result,current_score = evaluate_example(current_pred)
122
123 # if vmatch found the right read pos we check for right exons
124 # boundaries
125 if current_label:
126 if comparison_result:
127 pos_correct_ctr += 1
128 else:
129 pos_incorrect_ctr += 1
130
131 true_vmatch_instances_ctr += 1
132
133
134 current_scores.append(current_score)
135 current_labels.append(current_label)
136
137 total_vmatch_instances_ctr += 1
138
139 # check whether the correct predictions score higher than the incorrect
140 # ones
141 cmp_res = compare_scores_and_labels(current_scores,current_labels)
142 if cmp_res:
143 score_correct_ctr += 1
144 else:
145 score_incorrect_ctr += 1
146
147 # now that we have evaluated all instances put out all counters and sizes
148 print 'Total num. of examples: %d' % len(allPredictions)
149 print 'Number of correct ground truth examples: %d' % gt_correct_ctr
150 print 'Total num. of true vmatch instances %d' % true_vmatch_instances_ctr
151 print 'Correct pos: %d, incorrect pos: %d' %\
152 (pos_correct_ctr,pos_incorrect_ctr)
153 print 'Total num. of vmatch instances %d' % total_vmatch_instances_ctr
154 print 'Correct scores: %d, incorrect scores: %d' %\
155 (score_correct_ctr,score_incorrect_ctr)
156
157 #print 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
158 #print 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
159 #print 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
160
161
162 def collect_prediction(current_dir,run_name):
163 """
164 Given the toplevel directoy this function takes care that for each distinct
165 experiment the training and test predictions are evaluated.
166
167 """
168 train_suffix = '_allPredictions_TRAIN'
169 test_suffix = '_allPredictions_TEST'
170 jp = os.path.join
171
172 filename = jp(current_dir,run_name)+train_suffix
173 print 'Prediction on: %s' % filename
174 prediction_on(filename)
175
176 filename = jp(current_dir,run_name)+test_suffix
177 print 'Prediction on: %s' % filename
178 prediction_on(filename)
179
180
def perform_prediction(current_dir,run_name):
    """
    This function takes care of starting the jobs needed for the prediction phase
    of qpalma

    current_dir -- directory of one experiment run, passed to the shell script
    run_name    -- name of the run, used for the SGE job/log name
    """
    # submit the prediction shell script to the SGE queue via qsub
    script = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/doPrediction.sh'
    cmd = 'echo %s %s | qsub -l h_vmem=12.0G -cwd -j y -N \"%s.log\"' % (script,current_dir,run_name)
    os.system(cmd)
190
191
def forall_experiments(current_func,tl_dir):
    """
    Given the toplevel directoy this function calls for each subdir the
    function given as first argument. Which are at the moment:

    - perform_prediction, and
    - collect_prediction.

    current_func -- callable taking (run_directory, run_name)
    tl_dir       -- toplevel directory; every immediate subdirectory is
                    treated as one experiment run
    """
    dir_entries = os.listdir(tl_dir)
    dir_entries = [os.path.join(tl_dir,de) for de in dir_entries]
    run_dirs = [de for de in dir_entries if os.path.isdir(de)]

    for current_dir in run_dirs:
        # basename is portable, unlike splitting the path on '/'
        run_name = os.path.basename(current_dir)
        current_func(current_dir,run_name)
209
210
if __name__ == '__main__':
    # toplevel directory containing one subdirectory per experiment run;
    # renamed from `dir` to avoid shadowing the builtin
    top_dir = sys.argv[1]
    assert os.path.exists(top_dir), 'Error directory does not exist!'

    forall_experiments(perform_prediction,top_dir)
    #forall_experiments(collect_prediction,top_dir)
217
218 """
219 def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign):
220 newExons = []
221 oldElem = -1
222 SpliceAlign = SpliceAlign.flatten().tolist()[0]
223 SpliceAlign.append(-1)
224 for pos,elem in enumerate(SpliceAlign):
225 if pos == 0:
226 oldElem = -1
227 else:
228 oldElem = SpliceAlign[pos-1]
229
230 if oldElem != 0 and elem == 0: # start of exon
231 newExons.append(pos)
232
233 if oldElem == 0 and elem != 0: # end of exon
234 newExons.append(pos)
235
236 e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
237
238 if len(newExons) == 4:
239 e1_begin,e1_end = newExons[0],newExons[1]
240 e2_begin,e2_end = newExons[2],newExons[3]
241 else:
242 return None,None,None,None,newExons
243
244 e1_b_off = int(math.fabs(e1_begin - exons[0,0]))
245 e1_e_off = int(math.fabs(e1_end - exons[0,1]))
246
247 e2_b_off = int(math.fabs(e2_begin - exons[1,0]))
248 e2_e_off = int(math.fabs(e2_end - exons[1,1]))
249
250 pdb.set_trace()
251
252 return e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons
253 """
254
255 """
256 def evaluatePositions(eBegin,eEnd):
257 eBegin_pos = [elem for elem in eBegin if elem == 0]
258 eBegin_neg = [elem for elem in eBegin if elem != 0]
259 eEnd_pos = [elem for elem in eEnd if elem == 0]
260 eEnd_neg = [elem for elem in eEnd if elem != 0]
261
262 mean_eBegin_neg = 0
263 for idx in range(len(eBegin_neg)):
264 mean_eBegin_neg += eBegin_neg[idx]
265
266 try:
267 mean_eBegin_neg /= 1.0*len(eBegin_neg)
268 except:
269 mean_eBegin_neg = -1
270
271 mean_eEnd_neg = 0
272 for idx in range(len(eEnd_neg)):
273 mean_eEnd_neg += eEnd_neg[idx]
274
275 try:
276 mean_eEnd_neg /= 1.0*len(eEnd_neg)
277 except:
278 mean_eEnd_neg = -1
279
280 return eBegin_pos,eBegin_neg,eEnd_pos,eEnd_neg,mean_eBegin_neg,mean_eEnd_neg
281 """