+ Added a script that draws the coverage bar plots presented in the QPalma paper
[qpalma.git] / scripts / Evaluation.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import cPickle
5 import sys
6 import pdb
7 import os
8 import os.path
9 import math
10
11 from qpalma.parsers import *
12
13
# Module-level placeholder; never read or written by the functions below.
data = None
15
16
def result_statistic():
    """
    Placeholder for a result-statistics summary (unimplemented).

    The original stub only initialised two unused counters
    (num_unaligned_reads, num_incorrectly_aligned_reads) and fell through.

    Returns:
        None
    """
    # TODO: implement aggregation of unaligned / incorrectly aligned read counts.
    return None
25
26
def createErrorVSCutPlot(results):
    """
    Write the '+++' run's per-position error counters to
    'error_rates_table.tex'.

    Parameters:
        results -- dict mapping flag-triple keys (e.g. '+++') to sequences
                   whose second element maps/indexes position -> counter
                   (assumed dict or list -- TODO confirm against caller).
    """
    lines = ['\\begin{tabular}{|c|c|c|r|}', '\hline',\
       'Quality & Splice & Intron & \multicolumn{1}{c|}{Error on Positions} & \multicolumn{1}{c|}{Error on Scores} & \\',\
       'information & site pred. & length & \multicolumn{1}{c|}{rate}\\', '\hline']

    #for pos,key in enumerate(['---','+--','-+-','++-','--+','+-+','-++','+++']):
    for pos, key in enumerate(['+++']):
        res = results[key]
        for i in range(37):
            # Missing positions count as zero errors. BUGFIX: narrowed from a
            # bare 'except:' so genuine programming errors are not swallowed.
            try:
                ctr = res[1][i]
            except (KeyError, IndexError):
                ctr = 0

            lines.append('%d\n' % ctr)

        if pos % 2 == 1:
            lines.append('\hline')

    lines.append('\end{tabular}')

    fh = open('error_rates_table.tex', 'w+')
    for l in lines:
        fh.write(l + '\n')
    fh.close()
58
59
def createTable(results):
    """
    Render the evaluation results of all eight feature-flag combinations as a
    LaTeX tabular and write it to 'result_table.tex'.

    Parameters:
        results -- dict mapping a three-character flag key (quality / splice /
                   intron, '+' or '-') to a sequence of error rates in [0,1].
    """
    out_fh = open('result_table.tex','w+')

    flag_keys = ['---','+--','-+-','++-','--+','+-+','-++','+++']

    rows = ['\\begin{tabular}{|c|c|c|r|}', '\hline',\
       'Quality & Splice & Intron & \multicolumn{1}{c|}{Error on Positions} & \multicolumn{1}{c|}{Error on Scores} & \\',\
       'information & site pred. & length & \multicolumn{1}{c|}{rate}\\', '\hline']

    # First section: position error and score error, as percentages.
    for idx, flags in enumerate(flag_keys):
        percentages = [e*100 for e in results[flags]]
        rows.append( '%s & %s & %s & %2.2f & %2.2f \\%%\\\\' % ( flags[0], flags[1], flags[2], percentages[0], percentages[1] ) )
        if idx % 2 == 1:
            rows.append('\hline')

    # Second section: ground-truth error only (score column left as 'x').
    for idx, flags in enumerate(flag_keys):
        percentages = [e*100 for e in results[flags]]
        rows.append( '%s & %s & %s & %2.2f & x \\%%\\\\' % ( flags[0], flags[1], flags[2], percentages[2] ) )
        if idx % 2 == 1:
            rows.append('\hline')

    rows.append('\end{tabular}')

    for row in rows:
        out_fh.write(row + '\n')
    out_fh.close()
90
91
def compare_scores_and_labels(scores,labels):
    """
    Iterate through all predictions. If we find a correct prediction check
    whether this correct prediction scores higher than the incorrect
    predictions for this example.

    Returns True iff every correctly labeled prediction scores strictly
    higher than every incorrectly labeled one (vacuously True when either
    group is empty).

    Parameters:
        scores -- sequence of prediction scores
        labels -- parallel sequence of True/False labels
    """
    # O(n) reformulation of the original O(n^2) pairwise scan:
    # a violating pair (incorrect >= correct) exists exactly when
    # max(incorrect) >= min(correct).
    correct_scores = [s for s, lab in zip(scores, labels) if lab == True]
    incorrect_scores = [s for s, lab in zip(scores, labels) if lab == False]

    if not correct_scores or not incorrect_scores:
        return True

    return max(incorrect_scores) < min(correct_scores)
109
110
def compare_exons(predExons,trueExons):
    """
    Compare predicted exon boundaries against the ground truth.

    Parameters:
        predExons -- flat sequence of predicted boundaries; must have exactly
                     four entries (two exons) to be considered a match
        trueExons -- 2x2 array-like, indexed as trueExons[exon, begin/end]

    Returns True iff all four boundaries agree (within the original
    int(fabs(..)) tolerance, i.e. an absolute difference below 1).
    """
    if len(predExons) != 4:
        return False

    reference = [trueExons[0,0], trueExons[0,1], trueExons[1,0], trueExons[1,1]]

    # int(math.fabs(..)) == 0 reproduces the original tolerance exactly:
    # any |diff| < 1 counts as equal.
    for predicted, expected in zip(predExons, reference):
        if int(math.fabs(predicted - expected)) != 0:
            return False

    return True
131
132
def evaluate_unmapped_example(current_prediction):
    """
    Return True iff the predicted exons of *current_prediction* match its
    true exons exactly (delegates to compare_exons).
    """
    return compare_exons(current_prediction['predExons'],
                         current_prediction['trueExons'])
139
140
def evaluate_example(current_prediction):
    """
    Evaluate a single prediction dict.

    Parameters:
        current_prediction -- dict with keys 'label', 'DPScores' (matrix-like
            with .flatten().tolist()), 'predExons', 'trueExons',
            'alternative_start_pos' and 'start_pos'.

    Returns:
        (label, positions_match, dp_score)
    """
    label = current_prediction['label']
    pred_score = current_prediction['DPScores'].flatten().tolist()[0][0]

    # if the read was mapped by vmatch at an incorrect position we only have
    # to compare the score
    if label == False:
        return label, False, pred_score

    alt_offset = current_prediction['alternative_start_pos']
    true_offset = current_prediction['start_pos']

    predicted_positions = [e + alt_offset for e in current_prediction['predExons']]
    true_positions = [e + true_offset for e in current_prediction['trueExons'].flatten().tolist()[0]]

    return label, predicted_positions == true_positions, pred_score
161
162
163 def prediction_on(filename):
164
165 allPredictions = cPickle.load(open(filename))
166
167 gt_correct_ctr = 0
168 gt_incorrect_ctr = 0
169 incorrect_gt_cuts = {}
170
171 pos_correct_ctr = 0
172 pos_incorrect_ctr = 0
173 incorrect_vmatch_cuts = {}
174
175 score_correct_ctr = 0
176 score_incorrect_ctr = 0
177
178 total_gt_examples = 0
179 total_vmatch_instances_ctr = 0
180
181 true_vmatch_instances_ctr = 0
182
183 allUniquePredictions = [False]*len(allPredictions)
184
185 for pos,current_example_pred in enumerate(allPredictions):
186 for elem_nr,new_prediction in enumerate(current_example_pred[1:]):
187
188 if allUniquePredictions[pos] != False:
189 current_prediction = allUniquePredictions[pos]
190
191 current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
192 new_score = new_prediction['DPScores'].flatten().tolist()[0][0]
193
194 if current_a_score < new_score :
195 allUniquePredictions[id] = new_prediction
196
197 else:
198 allUniquePredictions[pos] = new_prediction
199
200 for current_pred in allUniquePredictions:
201 if current_pred == False:
202 continue
203
204 #for current_example_pred in allPredictions:
205 #gt_example = current_example_pred[0]
206 #gt_score = gt_example['DPScores'].flatten().tolist()[0][0]
207 #gt_correct = evaluate_unmapped_example(gt_example)
208
209 #exampleIdx = gt_example['exampleIdx']
210
211 #cut_pos = gt_example['true_cut']
212
213 #if gt_correct:
214 # gt_correct_ctr += 1
215 #else:
216 # gt_incorrect_ctr += 1
217
218 # try:
219 # incorrect_gt_cuts[cut_pos] += 1
220 # except:
221 # incorrect_gt_cuts[cut_pos] = 1
222
223 #total_gt_examples += 1
224
225 #current_scores = []
226 #current_labels = []
227 #for elem_nr,current_pred in enumerate(current_example_pred[1:]):
228
229 current_label,comparison_result,current_score = evaluate_example(current_pred)
230
231 # if vmatch found the right read pos we check for right exons
232 # boundaries
233 #if current_label:
234 if comparison_result:
235 pos_correct_ctr += 1
236 else:
237 pos_incorrect_ctr += 1
238
239 #try:
240 # incorrect_vmatch_cuts[cut_pos] += 1
241 #except:
242 # incorrect_vmatch_cuts[cut_pos] = 1
243
244 true_vmatch_instances_ctr += 1
245
246 #current_scores.append(current_score)
247 #current_labels.append(current_label)
248
249 total_vmatch_instances_ctr += 1
250
251 # check whether the correct predictions score higher than the incorrect
252 # ones
253 #cmp_res = compare_scores_and_labels(current_scores,current_labels)
254 #if cmp_res:
255 # score_correct_ctr += 1
256 #else:
257 # score_incorrect_ctr += 1
258
259 # now that we have evaluated all instances put out all counters and sizes
260 print 'Total num. of examples: %d' % len(allPredictions)
261 print 'Number of correct ground truth examples: %d' % gt_correct_ctr
262 print 'Total num. of true vmatch instances %d' % true_vmatch_instances_ctr
263 print 'Correct pos: %d, incorrect pos: %d' % (pos_correct_ctr,pos_incorrect_ctr)
264 print 'Total num. of vmatch instances %d' % total_vmatch_instances_ctr
265 print 'Correct scores: %d, incorrect scores: %d' %\
266 (score_correct_ctr,score_incorrect_ctr)
267
268 pos_error = 1.0 * pos_incorrect_ctr / total_vmatch_instances_ctr
269 score_error = 1.0 * score_incorrect_ctr / total_vmatch_instances_ctr
270 gt_error = 1.0 * gt_incorrect_ctr / total_gt_examples
271
272 return (pos_error,score_error,gt_error,incorrect_gt_cuts,incorrect_vmatch_cuts)
273
274
275 def collect_prediction(current_dir,run_name):
276 """
277 Given the toplevel directory this function takes care that for each distinct
278 experiment the training and test predictions are evaluated.
279
280 """
281 idx = 5
282
283 train_suffix = '_%d_allPredictions_TRAIN' % (idx)
284 test_suffix = '_%d_allPredictions_TEST' % (idx)
285
286 jp = os.path.join
287 b2s = ['-','+']
288
289 currentRun = cPickle.load(open(jp(current_dir,'run_object_%d.pickle'%(idx))))
290 QFlag = currentRun['enable_quality_scores']
291 SSFlag = currentRun['enable_splice_signals']
292 ILFlag = currentRun['enable_intron_length']
293 currentRunId = '%s%s%s' % (b2s[QFlag],b2s[SSFlag],b2s[ILFlag])
294
295 #filename = jp(current_dir,run_name)+train_suffix
296 #print 'Prediction on: %s' % filename
297 #train_result = prediction_on(filename)
298 train_result = []
299
300 filename = jp(current_dir,run_name)+test_suffix
301 print 'Prediction on: %s' % filename
302 test_result = prediction_on(filename)
303
304 return train_result,test_result,currentRunId
305
306
def perform_prediction(current_dir,run_name):
    """
    This function takes care of starting the jobs needed for the prediction phase
    of qpalma

    For each split index one doPrediction.sh invocation is submitted to the
    grid engine queue via qsub; the submission happens through os.system.
    """

    # NOTE(review): originally looped over five splits; currently only split 1.
    #for i in range(1,6):
    for i in range(1,2):
        # NOTE(review): the backslash continuation sits *inside* the string
        # literal, so the qsub part (including this line's leading whitespace)
        # is appended to the echo command, forming one shell pipeline.
        cmd = 'echo /fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/doPrediction.sh %s %d |\
        qsub -l h_vmem=12.0G -cwd -j y -N \"%s_%d.log\"'%(current_dir,i,run_name,i)

        #cmd = './doPrediction.sh %s 1>%s.out 2>%s.err' %(current_dir,run_name,run_name)
        #print cmd
        os.system(cmd)
322
323
324 def forall_experiments(current_func,tl_dir):
325 """
326 Given the toplevel directoy this function calls for each subdir the
327 function given as first argument. Which are at the moment:
328
329 - perform_prediction, and
330 - collect_prediction.
331
332 """
333
334 dir_entries = os.listdir(tl_dir)
335 dir_entries = [os.path.join(tl_dir,de) for de in dir_entries]
336 run_dirs = [de for de in dir_entries if os.path.isdir(de)]
337
338 all_results = {}
339 all_error_rates = {}
340
341 for current_dir in run_dirs:
342 run_name = current_dir.split('/')[-1]
343
344 pdb.set_trace()
345
346 if current_func.__name__ == 'perform_prediction':
347 current_func(current_dir,run_name)
348
349 if current_func.__name__ == 'collect_prediction':
350 train_result,test_result,currentRunId = current_func(current_dir,run_name)
351 all_results[currentRunId] = test_result
352 pos_error,score_error,gt_error,incorrect_gt_cuts,incorrect_vmatch_cuts = test_result
353 all_error_rates[currentRunId] = (incorrect_gt_cuts,incorrect_vmatch_cuts)
354
355 if current_func.__name__ == 'collect_prediction':
356 #createErrorVSCutPlot(all_error_rates)
357 createTable(all_results)
358
359 def _predict_on(filename,filtered_reads,with_coverage):
360 """
361 This function evaluates the predictions made by QPalma.
362 It needs a pickled file containing the predictions themselves and the
363 ascii file with original reads.
364
365 Optionally one can specifiy a coverage file containing for each read the
366 coverage number estimated by a remapping step.
367
368 """
369
370 coverage_map = {}
371
372 if with_coverage:
373 for line in open('/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/coverage_results/ALL_COVERAGES'):
374 id,coverage_nr = line.strip().split()
375 coverage_map[int(id)] = int(coverage_nr)
376
377 print 'parsing filtered reads..'
378 all_filtered_reads = parse_filtered_reads(filtered_reads)
379 print 'found %d filtered reads' % len(all_filtered_reads)
380
381 out_fh = open('predicted_positions.txt','w+')
382
383 allPredictions = cPickle.load(open(filename))
384
385 spliced_ctr = 0
386 unspliced_ctr = 0
387
388 pos_correct_ctr = 0
389 pos_incorrect_ctr = 0
390
391 correct_spliced_ctr = 0
392 correct_unspliced_ctr = 0
393
394 incorrect_spliced_ctr = 0
395 incorrect_unspliced_ctr = 0
396
397 correct_covered_splice_ctr = 0
398 incorrect_covered_splice_ctr = 0
399
400 total_vmatch_instances_ctr = 0
401
402 unspliced_spliced_reads_ctr = 0
403 wrong_spliced_reads_ctr = 0
404
405 wrong_aligned_unspliced_reads_ctr = 0
406 wrong_unspliced_reads_ctr = 0
407
408 cut_pos_ctr = {}
409
410 total_ctr = 0
411 skipped_ctr = 0
412
413 is_spliced = False
414 min_coverage = 3
415
416 allUniqPredictions = {}
417
418 print 'Got %d predictions' % len(allPredictions)
419
420 for new_prediction in allPredictions:
421 id = new_prediction['id']
422 id = int(id)
423
424 if allUniqPredictions.has_key(id):
425 current_prediction = allUniqPredictions[id]
426
427 current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
428 new_score = new_prediction['DPScores'].flatten().tolist()[0][0]
429
430 if current_a_score < new_score :
431 allUniqPredictions[id] = new_prediction
432
433 else:
434 allUniqPredictions[id] = new_prediction
435
436 print 'Got %d uniq predictions' % len(allUniqPredictions)
437
438 #for current_prediction in allPredictions:
439 for _id,current_prediction in allUniqPredictions.items():
440 id = current_prediction['id']
441 id = int(id)
442
443 if not id >= 1000000300000:
444 is_spliced = True
445 else:
446 is_spliced = False
447
448 is_covered = False
449
450 if is_spliced and with_coverage:
451 try:
452 current_coverage_nr = coverage_map[id]
453 is_covered = True
454 except:
455 is_covered = False
456
457
458 if is_spliced:
459 spliced_ctr += 1
460 else:
461 unspliced_ctr += 1
462
463 try:
464 current_ground_truth = all_filtered_reads[id]
465 except:
466 skipped_ctr += 1
467 continue
468
469 start_pos = current_prediction['start_pos']
470 chr = current_prediction['chr']
471 strand = current_prediction['strand']
472
473 #score = current_prediction['DPScores'].flatten().tolist()[0][0]
474 #pdb.set_trace()
475
476 predExons = current_prediction['predExons'] #:newExons, 'dna':dna, 'est':est
477 predExons = [e+start_pos for e in predExons]
478
479 spliced_flag = False
480
481 if len(predExons) == 4:
482 spliced_flag = True
483 predExons[1] -= 1
484 predExons[3] -= 1
485
486 if predExons[0] == 19504568:
487 pdb.set_trace()
488
489 cut_pos = current_ground_truth['true_cut']
490 p_start = current_ground_truth['p_start']
491 e_stop = current_ground_truth['exon_stop']
492 e_start = current_ground_truth['exon_start']
493 p_stop = current_ground_truth['p_stop']
494
495 true_cut = current_ground_truth['true_cut']
496
497 if p_start == predExons[0] and e_stop == predExons[1] and\
498 e_start == predExons[2] and p_stop == predExons[3]:
499 pos_correct = True
500 else:
501 pos_correct = False
502
503 elif len(predExons) == 2:
504 spliced_flag = False
505 predExons[1] -= 1
506
507 cut_pos = current_ground_truth['true_cut']
508 p_start = current_ground_truth['p_start']
509 p_stop = current_ground_truth['p_stop']
510
511 true_cut = current_ground_truth['true_cut']
512
513 if math.fabs(p_start - predExons[0]) <= 0:# and math.fabs(p_stop - predExons[1]) <= 2:
514 pos_correct = True
515 else:
516 pos_correct = False
517
518 else:
519 pos_correct = False
520
521 if is_spliced and not spliced_flag:
522 unspliced_spliced_reads_ctr += 1
523
524 if is_spliced and not pos_correct and len(predExons) == 4 and predExons[1]!=-1:
525 wrong_spliced_reads_ctr += 1
526
527 if not is_spliced and spliced_flag:
528 wrong_unspliced_reads_ctr += 1
529
530 if not is_spliced and not pos_correct:
531 wrong_aligned_unspliced_reads_ctr += 1
532
533 if pos_correct:
534 pos_correct_ctr += 1
535
536 if is_spliced:
537 correct_spliced_ctr += 1
538 if with_coverage and is_covered and current_coverage_nr >= min_coverage:
539 correct_covered_splice_ctr += 1
540
541 if not is_spliced:
542 correct_unspliced_ctr += 1
543
544 else:
545 pos_incorrect_ctr += 1
546
547 if is_spliced:
548 incorrect_spliced_ctr += 1
549 if with_coverage and is_covered and current_coverage_nr >= min_coverage:
550 incorrect_covered_splice_ctr += 1
551
552 if not is_spliced:
553 incorrect_unspliced_ctr += 1
554
555 if with_coverage and spliced_flag:
556 if not is_covered:
557 current_coverage_nr=0
558 if pos_correct:
559 print "%s\tcorrect\t%i" %( current_prediction['id'], current_coverage_nr)
560 else:
561 print "%s\twrong\t%i" %( current_prediction['id'], current_coverage_nr)
562
563 total_ctr += 1
564
565
566 numPredictions = len(allUniqPredictions)
567
568 # now that we have evaluated all instances put out all counters and sizes
569 print 'Total num. of examples: %d' % numPredictions
570
571 print "spliced/unspliced: %d,%d " % (spliced_ctr, unspliced_ctr )
572 print "Correct/incorrect spliced: %d,%d " % (correct_spliced_ctr, incorrect_spliced_ctr )
573 print "Correct/incorrect unspliced: %d,%d " % (correct_unspliced_ctr , incorrect_unspliced_ctr )
574 print "Correct/incorrect covered spliced read: %d,%d " %\
575 (correct_covered_splice_ctr,incorrect_covered_splice_ctr)
576
577 print "pos_correct: %d,%d" % (pos_correct_ctr , pos_incorrect_ctr )
578
579 print 'unspliced_spliced reads: %d' % unspliced_spliced_reads_ctr
580 print 'spliced reads at wrong_place: %d' % wrong_spliced_reads_ctr
581
582 print 'spliced_unspliced reads: %d' % wrong_unspliced_reads_ctr
583 print 'wrong aligned at wrong_pos: %d' % wrong_aligned_unspliced_reads_ctr
584
585 print 'total_ctr: %d' % total_ctr
586
587 print "skipped: %d " % skipped_ctr
588 print 'min. coverage: %d' % min_coverage
589
590 result_dict = {}
591 result_dict['skipped_ctr'] = skipped_ctr
592 result_dict['min_coverage'] = min_coverage
593
594 return result_dict
595
596
597
598
599
def predict_on(allPredictions,all_filtered_reads,with_coverage,coverage_fn,coverage_labels_fn):
    """
    This function evaluates the predictions made by QPalma.
    It needs a pickled file containing the predictions themselves and the
    ascii file with original reads.

    Optionally one can specifiy a coverage file containing for each read the
    coverage number estimated by a remapping step.

    Parameters:
        allPredictions     -- dict mapping a key to a list of prediction dicts
        all_filtered_reads -- filename of the ascii file with original reads
        with_coverage      -- if True, read per-read coverage from coverage_fn
        coverage_fn        -- whitespace-separated 'id coverage' file
        coverage_labels_fn -- output file receiving per-read correct/wrong lines

    Returns a list of (counter-name, value) pairs.
    """

    coverage_labels_fh = open(coverage_labels_fn,'w+')

    # qparser is a project-local module that indexes the filtered-reads file.
    import qparser
    qparser.parse_reads(all_filtered_reads)

    # read id -> estimated coverage number (from the remapping step)
    coverage_map = {}

    if with_coverage:
        for line in open(coverage_fn):
            id,coverage_nr = line.strip().split()
            coverage_map[int(id)] = int(coverage_nr)

    #out_fh = open('predicted_positions.txt','w+')

    spliced_ctr = 0
    unspliced_ctr = 0

    pos_correct_ctr = 0
    pos_incorrect_ctr = 0

    correct_spliced_ctr = 0
    correct_unspliced_ctr = 0

    incorrect_spliced_ctr = 0
    incorrect_unspliced_ctr = 0

    correct_covered_splice_ctr = 0
    incorrect_covered_splice_ctr = 0

    total_vmatch_instances_ctr = 0

    unspliced_spliced_reads_ctr = 0
    wrong_spliced_reads_ctr = 0

    wrong_aligned_unspliced_reads_ctr = 0
    wrong_unspliced_reads_ctr = 0

    cut_pos_ctr = {}

    total_ctr = 0
    skipped_ctr = 0

    is_spliced = False
    # spliced reads need at least this coverage to enter the covered counters
    min_coverage = 3

    # read id -> single best (highest DPScore) prediction
    allUniqPredictions = {}

    print 'Got %d predictions' % len(allPredictions)

    # Keep only the highest-scoring prediction per read id.
    for k,predictions in allPredictions.items():
        for new_prediction in predictions:
            id = new_prediction['id']
            id = int(id)

            if allUniqPredictions.has_key(id):
                current_prediction = allUniqPredictions[id]

                current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
                new_score = new_prediction['DPScores'].flatten().tolist()[0][0]

                if current_a_score < new_score :
                    allUniqPredictions[id] = new_prediction
            else:
                allUniqPredictions[id] = new_prediction

    print 'Got %d uniq predictions' % len(allUniqPredictions)

    for _id,current_prediction in allUniqPredictions.items():
        id = current_prediction['id']
        id = int(id)

        # ids below 1000000300000 denote spliced reads (dataset convention
        # -- TODO confirm against the read generation scripts)
        if not id >= 1000000300000:
            is_spliced = True
        else:
            is_spliced = False

        is_covered = False

        if is_spliced and with_coverage:
            try:
                current_coverage_nr = coverage_map[id]
                is_covered = True
            except:
                is_covered = False

        if is_spliced:
            spliced_ctr += 1
        else:
            unspliced_ctr += 1

        # reads without a ground-truth entry are skipped
        try:
            #current_ground_truth = all_filtered_reads[id]
            current_ground_truth = qparser.fetch_read(id)
        except:
            skipped_ctr += 1
            continue

        start_pos = current_prediction['start_pos']
        chr = current_prediction['chr']
        strand = current_prediction['strand']

        #score = current_prediction['DPScores'].flatten().tolist()[0][0]
        #pdb.set_trace()

        # predicted exon boundaries, shifted to absolute coordinates
        predExons = current_prediction['predExons'] #:newExons, 'dna':dna, 'est':est
        predExons = [e+start_pos for e in predExons]

        spliced_flag = False

        if len(predExons) == 4:
            # four boundaries -> spliced alignment; end coordinates are
            # adjusted by -1 (presumably exclusive ends -- TODO confirm)
            spliced_flag = True
            predExons[1] -= 1
            predExons[3] -= 1

            #if predExons[0] == 19504568:
            #   pdb.set_trace()

            cut_pos = current_ground_truth['true_cut']
            p_start = current_ground_truth['p_start']
            e_stop = current_ground_truth['exon_stop']
            e_start = current_ground_truth['exon_start']
            p_stop = current_ground_truth['p_stop']

            true_cut = current_ground_truth['true_cut']

            if p_start == predExons[0] and e_stop == predExons[1] and\
            e_start == predExons[2] and p_stop == predExons[3]:
                pos_correct = True
            else:
                pos_correct = False

        elif len(predExons) == 2:
            # two boundaries -> unspliced alignment
            spliced_flag = False
            predExons[1] -= 1

            cut_pos = current_ground_truth['true_cut']
            p_start = current_ground_truth['p_start']
            p_stop = current_ground_truth['p_stop']

            true_cut = current_ground_truth['true_cut']

            # only the start position is compared here
            if math.fabs(p_start - predExons[0]) <= 0:# and math.fabs(p_stop - predExons[1]) <= 2:
                pos_correct = True
            else:
                pos_correct = False

        else:
            pos_correct = False

        if is_spliced and not spliced_flag:
            unspliced_spliced_reads_ctr += 1

        if is_spliced and not pos_correct and len(predExons) == 4 and predExons[1]!=-1:
            wrong_spliced_reads_ctr += 1

        if not is_spliced and spliced_flag:
            wrong_unspliced_reads_ctr += 1

        if not is_spliced and not pos_correct:
            wrong_aligned_unspliced_reads_ctr += 1

        if pos_correct:
            pos_correct_ctr += 1

            if is_spliced:
                correct_spliced_ctr += 1
                if with_coverage and is_covered and current_coverage_nr >= min_coverage:
                    correct_covered_splice_ctr += 1

            if not is_spliced:
                correct_unspliced_ctr += 1

        else:
            pos_incorrect_ctr += 1

            if is_spliced:
                incorrect_spliced_ctr += 1
                if with_coverage and is_covered and current_coverage_nr >= min_coverage:
                    incorrect_covered_splice_ctr += 1

            if not is_spliced:
                incorrect_unspliced_ctr += 1

        # write a correct/wrong label line for every covered spliced prediction
        if with_coverage and spliced_flag:
            if not is_covered:
                current_coverage_nr=0

            if pos_correct:
                new_line = "%s\tcorrect\t%i" %( current_prediction['id'], current_coverage_nr)
            else:
                new_line = "%s\twrong\t%i" %( current_prediction['id'], current_coverage_nr)

            coverage_labels_fh.write(new_line+'\n')

        total_ctr += 1

    coverage_labels_fh.close()

    numPredictions = len(allUniqPredictions)

    result = []

    # now that we have evaluated all instances put out all counters and sizes
    result.append(('numPredictions',numPredictions))
    result.append(('spliced_ctr',spliced_ctr))
    result.append(('unspliced_ctr',unspliced_ctr))

    result.append(('correct_spliced_ctr',correct_spliced_ctr))
    result.append(('incorrect_spliced_ctr',incorrect_spliced_ctr))

    result.append(('correct_unspliced_ctr',correct_unspliced_ctr))
    result.append(('incorrect_unspliced_ctr',incorrect_unspliced_ctr))

    result.append(('correct_covered_splice_ctr',correct_covered_splice_ctr))
    result.append(('incorrect_covered_splice_ctr',incorrect_covered_splice_ctr))

    result.append(('pos_correct_ctr',pos_correct_ctr))
    result.append(('pos_incorrect_ctr',pos_incorrect_ctr))

    result.append(('unspliced_spliced_reads_ctr',unspliced_spliced_reads_ctr))
    result.append(('wrong_spliced_reads_ctr',wrong_spliced_reads_ctr))

    result.append(('wrong_unspliced_reads_ctr',wrong_unspliced_reads_ctr))
    result.append(('wrong_aligned_unspliced_reads_ctr',wrong_aligned_unspliced_reads_ctr))

    result.append(('total_ctr',total_ctr))

    result.append(('skipped_ctr',skipped_ctr))
    result.append(('min_coverage',min_coverage))

    return result
846
847
848
def print_result(result):
    """Print every (counter-name, value) pair of *result*, one per line."""
    for name, ctr in result:
        sys.stdout.write('%s %s\n' % (name, ctr))
853
854
855 def load_chunks(current_dir):
856 chunks_fn = []
857 for fn in os.listdir(current_dir):
858 if fn.startswith('chunk'):
859 chunks_fn.append(fn)
860
861 allPredictions = []
862
863 for c_fn in chunks_fn:
864 full_fn = os.path.join(current_dir,c_fn)
865 print full_fn
866 current_chunk = cPickle.load(open(full_fn))
867 allPredictions.extend(current_chunk)
868
869 return allPredictions
870
871
def predict_on_all_chunks(current_dir,training_keys_fn):
    """
    We load all chunks from the current_dir belonging to one run.
    Then we load the saved keys of the training set to restore the training and
    testing sets.
    Once we have done that we separately evaluate both sets.

    Parameters:
        current_dir      -- directory holding the pickled prediction chunks
        training_keys_fn -- pickle file with the ids used for training

    Returns the evaluation result of the test set.
    """
    predictions_by_id = {}
    for entry in load_chunks(current_dir):
        predictions_by_id.setdefault(entry['id'], []).append(entry)

    training_keys = cPickle.load(open(training_keys_fn))

    # Some reads used for training may never have been found by vmatch at
    # all, hence missing keys are silently skipped.
    training_set = {}
    for key in training_keys:
        if key in predictions_by_id:
            training_set[key] = predictions_by_id.pop(key)

    # everything not claimed by the training keys forms the test set
    test_set = predictions_by_id

    coverage_fn = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/all_coverages'
    all_filtered_reads = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/allReads.full'
    coverage_labels_fn = 'COVERAGE_LABELS'

    result_test = predict_on(test_set,all_filtered_reads,True,coverage_fn,coverage_labels_fn)

    return result_test
922
923
if __name__ == '__main__':
    # Placeholder entry point; the evaluation is driven by calling
    # forall_experiments(...) / predict_on_all_chunks(...) manually.
    pass