+ minor changes in the paths
[qpalma.git] / scripts / Evaluation.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import cPickle
5 import sys
6 import pdb
7 import os
8 import os.path
9 import math
10
11 from qpalma.parsers import *
12
13
# Module-level placeholder -- never read or written anywhere in this file;
# possibly referenced by external scripts (TODO(review): confirm before removing).
data = None
15
16
def result_statistic():
    """
    Stub: intended to aggregate alignment statistics, but never implemented.
    The counters below are initialised and then discarded.
    """
    num_unaligned_reads = 0
    num_incorrectly_aligned_reads = 0
    pass
25
26
def createErrorVSCutPlot(results):
    """
    Write a LaTeX fragment ('error_rates_table.tex') tabulating, for the
    full-feature run ('+++'), the per-position counters stored in
    results[key][1] for read positions 0..36.

    results -- dict mapping a 3-char flag string (e.g. '+++') to a sequence
               whose element [1] is indexable by position; positions with no
               entry count as 0.
    """
    lines = ['\\begin{tabular}{|c|c|c|r|}', '\hline',
        'Quality & Splice & Intron & \multicolumn{1}{c|}{Error on Positions} & \multicolumn{1}{c|}{Error on Scores} & \\',
        'information & site pred. & length & \multicolumn{1}{c|}{rate}\\', '\hline']

    #for pos,key in enumerate(['---','+--','-+-','++-','--+','+-+','-++','+++']):
    for pos, key in enumerate(['+++']):
        res = results[key]
        for i in range(37):
            # a missing entry means "no reads observed at this position"
            try:
                ctr = res[1][i]
            except (KeyError, IndexError, TypeError):
                ctr = 0

            lines.append('%d\n' % ctr)

        if pos % 2 == 1:
            lines.append('\hline')

    lines.append('\end{tabular}')

    # 'with' guarantees the handle is closed even if a write fails
    with open('error_rates_table.tex', 'w+') as fh:
        for l in lines:
            fh.write(l + '\n')
58
59
def createTable(results):
    """
    Write a LaTeX table ('result_table.tex') of the evaluation results.

    results -- dict mapping each 3-char flag string (quality / splice-site /
               intron-length toggles) to a sequence of error rates in [0,1]:
               [pos_error, score_error, gt_error].
    """
    all_keys = ['---','+--','-+-','++-','--+','+-+','-++','+++']

    lines = ['\\begin{tabular}{|c|c|c|r|}', '\hline',
        'Quality & Splice & Intron & \multicolumn{1}{c|}{Error on Positions} & \multicolumn{1}{c|}{Error on Scores} & \\',
        'information & site pred. & length & \multicolumn{1}{c|}{rate}\\', '\hline']

    # first section: position and score error rates (converted to percent)
    for pos, key in enumerate(all_keys):
        res = [e * 100 for e in results[key]]

        lines.append('%s & %s & %s & %2.2f & %2.2f \\%%\\\\' % (key[0], key[1], key[2], res[0], res[1]))
        if pos % 2 == 1:
            lines.append('\hline')

    # second section: ground-truth error rate only
    for pos, key in enumerate(all_keys):
        res = [e * 100 for e in results[key]]

        lines.append('%s & %s & %s & %2.2f & x \\%%\\\\' % (key[0], key[1], key[2], res[2]))
        if pos % 2 == 1:
            lines.append('\hline')

    lines.append('\end{tabular}')

    # 'with' guarantees the handle is closed even if a write fails
    with open('result_table.tex', 'w+') as fh:
        for l in lines:
            fh.write(l + '\n')
90
91
def compare_scores_and_labels(scores, labels):
    """
    Check that every correctly-labelled prediction outscores all incorrect
    ones.  Returns False as soon as a wrongly-labelled prediction ties or
    beats a correct one; True otherwise (including when no correct
    prediction exists).
    """
    for good_pos, good_score in enumerate(scores):
        if labels[good_pos] != True:
            continue

        # any wrongly-labelled entry scoring at least as high disqualifies
        beaten = any(other >= good_score
                     for other_pos, other in enumerate(scores)
                     if other_pos != good_pos and labels[other_pos] == False)
        if beaten:
            return False

    return True
109
110
def compare_exons(predExons, trueExons):
    """
    Return True iff the predicted exon boundaries (a flat list of exactly
    four coordinates) match the true two-exon layout exactly.

    trueExons supports 2-d indexing (e.g. a numpy matrix/array):
    row = exon index, column = begin/end.
    """
    if len(predExons) != 4:
        return False

    pred_pairs = [(predExons[0], predExons[1]), (predExons[2], predExons[3])]

    for exon_idx, (begin, end) in enumerate(pred_pairs):
        begin_off = int(math.fabs(begin - trueExons[exon_idx, 0]))
        end_off = int(math.fabs(end - trueExons[exon_idx, 1]))
        if begin_off != 0 or end_off != 0:
            return False

    return True
131
132
def evaluate_unmapped_example(current_prediction):
    """
    Compare the predicted exon positions of a single (unmapped) example
    against its ground truth; True iff they agree exactly.
    """
    return compare_exons(current_prediction['predExons'],
                         current_prediction['trueExons'])
139
140
def evaluate_example(current_prediction):
    """
    Evaluate one vmatch-placed prediction.

    Returns (label, positions_match, dp_score).  `label` tells whether
    vmatch placed the read correctly in the first place; when it did not,
    only the score is meaningful and positions_match is reported as False.
    """
    label = current_prediction['label']
    dp_score = current_prediction['DPScores'].flatten().tolist()[0][0]

    # read was mapped by vmatch at an incorrect position: exon boundaries
    # are meaningless, only the score can be compared
    if label == False:
        return label, False, dp_score

    offset_pred = current_prediction['alternative_start_pos']
    offset_true = current_prediction['start_pos']

    pred_positions = [p + offset_pred for p in current_prediction['predExons']]
    true_positions = [t + offset_true
                      for t in current_prediction['trueExons'].flatten().tolist()[0]]

    return label, pred_positions == true_positions, dp_score
161
162
def prediction_on(filename):
    """
    Load pickled QPalma predictions from `filename` and report how many
    alignments hit the correct read positions.

    Returns (pos_error, score_error, gt_error, incorrect_gt_cuts,
    incorrect_vmatch_cuts).  The ground-truth and score bookkeeping is
    currently disabled (those counters stay 0), so their error rates are
    reported as 0.0 instead of crashing on a zero division.
    """
    allPredictions = cPickle.load(open(filename))

    gt_correct_ctr = 0
    gt_incorrect_ctr = 0
    incorrect_gt_cuts = {}

    pos_correct_ctr = 0
    pos_incorrect_ctr = 0
    incorrect_vmatch_cuts = {}

    score_correct_ctr = 0
    score_incorrect_ctr = 0

    total_gt_examples = 0
    total_vmatch_instances_ctr = 0

    true_vmatch_instances_ctr = 0

    # keep, per example, only the alternative alignment with the best DP score
    allUniquePredictions = [False] * len(allPredictions)

    for pos, current_example_pred in enumerate(allPredictions):
        for elem_nr, new_prediction in enumerate(current_example_pred[1:]):
            if allUniquePredictions[pos] != False:
                current_prediction = allUniquePredictions[pos]

                current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
                new_score = new_prediction['DPScores'].flatten().tolist()[0][0]

                if current_a_score < new_score:
                    # BUGFIX: was `allUniquePredictions[id]` -- indexing the
                    # list with the builtin `id`, a TypeError whenever a
                    # better-scoring alternative was found
                    allUniquePredictions[pos] = new_prediction
            else:
                allUniquePredictions[pos] = new_prediction

    for current_pred in allUniquePredictions:
        if current_pred == False:
            continue

        current_label, comparison_result, current_score = evaluate_example(current_pred)

        # if vmatch found the right read pos we check for right exon boundaries
        if comparison_result:
            pos_correct_ctr += 1
        else:
            pos_incorrect_ctr += 1

        true_vmatch_instances_ctr += 1
        total_vmatch_instances_ctr += 1

    # now that we have evaluated all instances put out all counters and sizes
    print('Total num. of examples: %d' % len(allPredictions))
    print('Number of correct ground truth examples: %d' % gt_correct_ctr)
    print('Total num. of true vmatch instances %d' % true_vmatch_instances_ctr)
    print('Correct pos: %d, incorrect pos: %d' % (pos_correct_ctr, pos_incorrect_ctr))
    print('Total num. of vmatch instances %d' % total_vmatch_instances_ctr)
    print('Correct scores: %d, incorrect scores: %d' %
        (score_correct_ctr, score_incorrect_ctr))

    # BUGFIX: guard the divisions.  total_gt_examples is always 0 here (the
    # ground-truth bookkeeping is commented out), so the original code raised
    # ZeroDivisionError on every call.
    if total_vmatch_instances_ctr:
        pos_error = 1.0 * pos_incorrect_ctr / total_vmatch_instances_ctr
        score_error = 1.0 * score_incorrect_ctr / total_vmatch_instances_ctr
    else:
        pos_error = 0.0
        score_error = 0.0

    if total_gt_examples:
        gt_error = 1.0 * gt_incorrect_ctr / total_gt_examples
    else:
        gt_error = 0.0

    return (pos_error, score_error, gt_error, incorrect_gt_cuts, incorrect_vmatch_cuts)
273
274
def collect_prediction(current_dir, run_name, idx=5):
    """
    Evaluate the stored test predictions of one experiment directory.

    current_dir -- toplevel directory of the run
    run_name    -- basename of the prediction files
    idx         -- iteration index of the run/prediction files (previously
                   hard-coded to 5; kept as the default for compatibility)

    Returns (train_result, test_result, currentRunId).  train_result is
    currently an empty list because training-set evaluation is disabled.
    """
    train_suffix = '_%d_allPredictions_TRAIN' % (idx)
    test_suffix = '_%d_allPredictions_TEST' % (idx)

    jp = os.path.join
    b2s = ['-', '+']   # maps a boolean flag to its one-char run-id symbol

    currentRun = cPickle.load(open(jp(current_dir, 'run_object_%d.pickle' % (idx))))
    QFlag = currentRun['enable_quality_scores']
    SSFlag = currentRun['enable_splice_signals']
    ILFlag = currentRun['enable_intron_length']
    currentRunId = '%s%s%s' % (b2s[QFlag], b2s[SSFlag], b2s[ILFlag])

    # training-set evaluation is disabled for now
    train_result = []

    filename = jp(current_dir, run_name) + test_suffix
    print('Prediction on: %s' % filename)
    test_result = prediction_on(filename)

    return train_result, test_result, currentRunId
305
306
def perform_prediction(current_dir,run_name):
    """
    This function takes care of starting the jobs needed for the prediction phase
    of qpalma
    """

    # NOTE(review): only a single prediction job is submitted; the commented
    # range suggests five splits were used previously -- confirm before
    # re-enabling.
    #for i in range(1,6):
    for i in range(1,2):
        # Builds a Sun Grid Engine submission line; the string is handed to a
        # shell via os.system, so current_dir/run_name must not contain shell
        # metacharacters (they are not escaped here).
        cmd = 'echo /fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/doPrediction.sh %s %d |\
qsub -l h_vmem=12.0G -cwd -j y -N \"%s_%d.log\"'%(current_dir,i,run_name,i)

        #cmd = './doPrediction.sh %s 1>%s.out 2>%s.err' %(current_dir,run_name,run_name)
        #print cmd
        os.system(cmd)
321
322
323
def forall_experiments(current_func, tl_dir):
    """
    Apply `current_func` to every run subdirectory of `tl_dir`.

    Supported callbacks (dispatched on __name__):
    - perform_prediction: submit the prediction jobs
    - collect_prediction: gather results and write the LaTeX result table
    """
    dir_entries = os.listdir(tl_dir)
    dir_entries = [os.path.join(tl_dir, de) for de in dir_entries]
    run_dirs = [de for de in dir_entries if os.path.isdir(de)]

    all_results = {}
    all_error_rates = {}

    for current_dir in run_dirs:
        run_name = current_dir.split('/')[-1]

        # BUGFIX: removed a leftover pdb.set_trace() that halted every
        # unattended run at this point

        if current_func.__name__ == 'perform_prediction':
            current_func(current_dir, run_name)

        if current_func.__name__ == 'collect_prediction':
            train_result, test_result, currentRunId = current_func(current_dir, run_name)
            all_results[currentRunId] = test_result
            pos_error, score_error, gt_error, incorrect_gt_cuts, incorrect_vmatch_cuts = test_result
            all_error_rates[currentRunId] = (incorrect_gt_cuts, incorrect_vmatch_cuts)

    if current_func.__name__ == 'collect_prediction':
        #createErrorVSCutPlot(all_error_rates)
        createTable(all_results)
358
def _predict_on(filename, filtered_reads, with_coverage):
    """
    Evaluate the QPalma predictions pickled in `filename` against the
    original reads listed in the ascii file `filtered_reads`.

    If `with_coverage` is set, per-read coverage numbers estimated by a
    remapping step are loaded from a hard-coded path (see NOTE below) and
    the spliced-read statistics are additionally broken down by coverage.

    Prints a summary and returns a dict with 'skipped_ctr' and
    'min_coverage'.
    """
    coverage_map = {}

    if with_coverage:
        # NOTE(review): hard-coded coverage file -- confirm this path is
        # still valid or promote it to a parameter
        for line in open('/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/coverage_results/ALL_COVERAGES'):
            id, coverage_nr = line.strip().split()
            coverage_map[int(id)] = int(coverage_nr)

    print('parsing filtered reads..')
    all_filtered_reads = parse_filtered_reads(filtered_reads)
    print('found %d filtered reads' % len(all_filtered_reads))

    # BUGFIX: removed an unused 'predicted_positions.txt' handle that was
    # opened here but never written to or closed

    allPredictions = cPickle.load(open(filename))

    spliced_ctr = 0
    unspliced_ctr = 0

    pos_correct_ctr = 0
    pos_incorrect_ctr = 0

    correct_spliced_ctr = 0
    correct_unspliced_ctr = 0

    incorrect_spliced_ctr = 0
    incorrect_unspliced_ctr = 0

    correct_covered_splice_ctr = 0
    incorrect_covered_splice_ctr = 0

    total_vmatch_instances_ctr = 0

    unspliced_spliced_reads_ctr = 0
    wrong_spliced_reads_ctr = 0

    wrong_aligned_unspliced_reads_ctr = 0
    wrong_unspliced_reads_ctr = 0

    cut_pos_ctr = {}

    total_ctr = 0
    skipped_ctr = 0

    is_spliced = False
    min_coverage = 3

    allUniqPredictions = {}

    print('Got %d predictions' % len(allPredictions))

    # keep only the best-scoring alignment per read id
    for new_prediction in allPredictions:
        id = int(new_prediction['id'])

        if id in allUniqPredictions:
            current_prediction = allUniqPredictions[id]

            current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
            new_score = new_prediction['DPScores'].flatten().tolist()[0][0]

            if current_a_score < new_score:
                allUniqPredictions[id] = new_prediction
        else:
            allUniqPredictions[id] = new_prediction

    print('Got %d uniq predictions' % len(allUniqPredictions))

    for _id, current_prediction in allUniqPredictions.items():
        id = int(current_prediction['id'])

        # ids below 1000000300000 denote spliced reads
        # (TODO(review): confirm this id-range convention)
        is_spliced = id < 1000000300000

        is_covered = False

        if is_spliced and with_coverage:
            try:
                current_coverage_nr = coverage_map[id]
                is_covered = True
            except KeyError:
                is_covered = False

        if is_spliced:
            spliced_ctr += 1
        else:
            unspliced_ctr += 1

        try:
            current_ground_truth = all_filtered_reads[id]
        except (KeyError, IndexError):
            skipped_ctr += 1
            continue

        start_pos = current_prediction['start_pos']
        chr = current_prediction['chr']
        strand = current_prediction['strand']

        # predicted exon boundaries, shifted into genome coordinates
        predExons = current_prediction['predExons']
        predExons = [e + start_pos for e in predExons]

        spliced_flag = False

        if len(predExons) == 4:
            spliced_flag = True
            predExons[1] -= 1
            predExons[3] -= 1

            # BUGFIX: removed a leftover debugger breakpoint that fired for
            # predExons[0] == 19504568

            cut_pos = current_ground_truth['true_cut']
            p_start = current_ground_truth['p_start']
            e_stop = current_ground_truth['exon_stop']
            e_start = current_ground_truth['exon_start']
            p_stop = current_ground_truth['p_stop']

            true_cut = current_ground_truth['true_cut']

            pos_correct = (p_start == predExons[0] and e_stop == predExons[1] and
                           e_start == predExons[2] and p_stop == predExons[3])

        elif len(predExons) == 2:
            spliced_flag = False
            predExons[1] -= 1

            cut_pos = current_ground_truth['true_cut']
            p_start = current_ground_truth['p_start']
            p_stop = current_ground_truth['p_stop']

            true_cut = current_ground_truth['true_cut']

            # only the alignment start is compared (end check was disabled)
            pos_correct = math.fabs(p_start - predExons[0]) <= 0

        else:
            pos_correct = False

        if is_spliced and not spliced_flag:
            unspliced_spliced_reads_ctr += 1

        if is_spliced and not pos_correct and len(predExons) == 4 and predExons[1] != -1:
            wrong_spliced_reads_ctr += 1

        if not is_spliced and spliced_flag:
            wrong_unspliced_reads_ctr += 1

        if not is_spliced and not pos_correct:
            wrong_aligned_unspliced_reads_ctr += 1

        if pos_correct:
            pos_correct_ctr += 1

            if is_spliced:
                correct_spliced_ctr += 1
                if with_coverage and is_covered and current_coverage_nr >= min_coverage:
                    correct_covered_splice_ctr += 1

            if not is_spliced:
                correct_unspliced_ctr += 1
        else:
            pos_incorrect_ctr += 1

            if is_spliced:
                incorrect_spliced_ctr += 1
                if with_coverage and is_covered and current_coverage_nr >= min_coverage:
                    incorrect_covered_splice_ctr += 1

            if not is_spliced:
                incorrect_unspliced_ctr += 1

        if with_coverage and spliced_flag:
            if not is_covered:
                current_coverage_nr = 0
            if pos_correct:
                print("%s\tcorrect\t%i" % (current_prediction['id'], current_coverage_nr))
            else:
                print("%s\twrong\t%i" % (current_prediction['id'], current_coverage_nr))

        total_ctr += 1

    numPredictions = len(allUniqPredictions)

    # now that we have evaluated all instances put out all counters and sizes
    print('Total num. of examples: %d' % numPredictions)

    print("spliced/unspliced: %d,%d " % (spliced_ctr, unspliced_ctr))
    print("Correct/incorrect spliced: %d,%d " % (correct_spliced_ctr, incorrect_spliced_ctr))
    print("Correct/incorrect unspliced: %d,%d " % (correct_unspliced_ctr, incorrect_unspliced_ctr))
    print("Correct/incorrect covered spliced read: %d,%d " %
        (correct_covered_splice_ctr, incorrect_covered_splice_ctr))

    print("pos_correct: %d,%d" % (pos_correct_ctr, pos_incorrect_ctr))

    print('unspliced_spliced reads: %d' % unspliced_spliced_reads_ctr)
    print('spliced reads at wrong_place: %d' % wrong_spliced_reads_ctr)

    print('spliced_unspliced reads: %d' % wrong_unspliced_reads_ctr)
    print('wrong aligned at wrong_pos: %d' % wrong_aligned_unspliced_reads_ctr)

    print('total_ctr: %d' % total_ctr)

    print("skipped: %d " % skipped_ctr)
    print('min. coverage: %d' % min_coverage)

    result_dict = {}
    result_dict['skipped_ctr'] = skipped_ctr
    result_dict['min_coverage'] = min_coverage

    return result_dict
595
596
597
598
599
def predict_on(allPredictions, all_filtered_reads, all_labels_fn, with_coverage, coverage_fn, coverage_labels_fn):
    """
    Evaluate QPalma predictions.

    allPredictions     -- dict: read id -> list of candidate predictions
    all_filtered_reads -- ascii file with the original reads (parsed via qparser)
    all_labels_fn      -- output file: '<id> correct|wrong' per spliced read
    with_coverage      -- if set, break spliced statistics down by coverage
    coverage_fn        -- file with '<id> <coverage>' lines from the remapping step
    coverage_labels_fn -- output file with per-read coverage labels

    Returns a list of (counter_name, value) pairs.
    """
    coverage_labels_fh = open(coverage_labels_fn, 'w+')
    all_labels_fh = open(all_labels_fn, 'w+')

    import qparser
    qparser.parse_reads(all_filtered_reads)

    coverage_map = {}

    if with_coverage:
        for line in open(coverage_fn):
            id, coverage_nr = line.strip().split()
            coverage_map[int(id)] = int(coverage_nr)

    spliced_ctr = 0
    unspliced_ctr = 0

    pos_correct_ctr = 0
    pos_incorrect_ctr = 0

    correct_spliced_ctr = 0
    correct_unspliced_ctr = 0

    incorrect_spliced_ctr = 0
    incorrect_unspliced_ctr = 0

    correct_covered_splice_ctr = 0
    incorrect_covered_splice_ctr = 0

    total_vmatch_instances_ctr = 0

    unspliced_spliced_reads_ctr = 0
    wrong_spliced_reads_ctr = 0

    wrong_aligned_unspliced_reads_ctr = 0
    wrong_unspliced_reads_ctr = 0

    cut_pos_ctr = {}

    total_ctr = 0
    skipped_ctr = 0

    is_spliced = False
    min_coverage = 3

    allUniqPredictions = {}

    print('Got %d predictions' % len(allPredictions))

    # keep only the best-scoring alignment per read id
    for k, predictions in allPredictions.items():
        for new_prediction in predictions:
            id = int(new_prediction['id'])

            if id in allUniqPredictions:
                current_prediction = allUniqPredictions[id]

                current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
                new_score = new_prediction['DPScores'].flatten().tolist()[0][0]

                if current_a_score < new_score:
                    allUniqPredictions[id] = new_prediction
            else:
                allUniqPredictions[id] = new_prediction

    print('Got %d uniq predictions' % len(allUniqPredictions))

    for _id, current_prediction in allUniqPredictions.items():
        id = int(current_prediction['id'])

        # ids below 1000000300000 denote spliced reads
        # (TODO(review): confirm this id-range convention)
        is_spliced = id < 1000000300000

        is_covered = False

        if is_spliced and with_coverage:
            try:
                current_coverage_nr = coverage_map[id]
                is_covered = True
            except KeyError:
                is_covered = False

        if is_spliced:
            spliced_ctr += 1
        else:
            unspliced_ctr += 1

        try:
            # best-effort: qparser.fetch_read's failure mode is unknown here
            # (TODO(review): narrow this except once qparser is documented)
            current_ground_truth = qparser.fetch_read(id)
        except:
            skipped_ctr += 1
            continue

        start_pos = current_prediction['start_pos']
        chr = current_prediction['chr']
        strand = current_prediction['strand']

        # predicted exon boundaries, shifted into genome coordinates
        predExons = current_prediction['predExons']
        predExons = [e + start_pos for e in predExons]

        spliced_flag = False

        if len(predExons) == 4:
            spliced_flag = True
            predExons[1] -= 1
            predExons[3] -= 1

            cut_pos = current_ground_truth['true_cut']
            p_start = current_ground_truth['p_start']
            e_stop = current_ground_truth['exon_stop']
            e_start = current_ground_truth['exon_start']
            p_stop = current_ground_truth['p_stop']

            true_cut = current_ground_truth['true_cut']

            pos_correct = (p_start == predExons[0] and e_stop == predExons[1] and
                           e_start == predExons[2] and p_stop == predExons[3])

        elif len(predExons) == 2:
            spliced_flag = False
            predExons[1] -= 1

            cut_pos = current_ground_truth['true_cut']
            p_start = current_ground_truth['p_start']
            p_stop = current_ground_truth['p_stop']

            true_cut = current_ground_truth['true_cut']

            # only the alignment start is compared (end check was disabled)
            pos_correct = math.fabs(p_start - predExons[0]) <= 0

        else:
            pos_correct = False

        if is_spliced and not spliced_flag:
            unspliced_spliced_reads_ctr += 1

        if is_spliced and not pos_correct and len(predExons) == 4 and predExons[1] != -1:
            wrong_spliced_reads_ctr += 1

        if not is_spliced and spliced_flag:
            wrong_unspliced_reads_ctr += 1

        if not is_spliced and not pos_correct:
            wrong_aligned_unspliced_reads_ctr += 1

        if pos_correct:
            pos_correct_ctr += 1

            if is_spliced:
                correct_spliced_ctr += 1
                all_labels_fh.write('%d correct\n' % id)
                if with_coverage and is_covered and current_coverage_nr >= min_coverage:
                    correct_covered_splice_ctr += 1

            if not is_spliced:
                correct_unspliced_ctr += 1
        else:
            pos_incorrect_ctr += 1

            if is_spliced:
                incorrect_spliced_ctr += 1
                all_labels_fh.write('%d wrong\n' % id)
                if with_coverage and is_covered and current_coverage_nr >= min_coverage:
                    incorrect_covered_splice_ctr += 1

            if not is_spliced:
                incorrect_unspliced_ctr += 1

        if with_coverage:
            if not is_covered:
                current_coverage_nr = 0

            if pos_correct:
                new_line = "%s\tcorrect\t%i" % (current_prediction['id'], current_coverage_nr)
            else:
                new_line = "%s\twrong\t%i" % (current_prediction['id'], current_coverage_nr)

            coverage_labels_fh.write(new_line + '\n')

        total_ctr += 1

    coverage_labels_fh.close()
    # BUGFIX: this handle was never closed before (leaked until interpreter exit)
    all_labels_fh.close()

    numPredictions = len(allUniqPredictions)

    result = []

    # now that we have evaluated all instances put out all counters and sizes
    result.append(('numPredictions', numPredictions))
    result.append(('spliced_ctr', spliced_ctr))
    result.append(('unspliced_ctr', unspliced_ctr))

    result.append(('correct_spliced_ctr', correct_spliced_ctr))
    result.append(('incorrect_spliced_ctr', incorrect_spliced_ctr))

    result.append(('correct_unspliced_ctr', correct_unspliced_ctr))
    result.append(('incorrect_unspliced_ctr', incorrect_unspliced_ctr))

    result.append(('correct_covered_splice_ctr', correct_covered_splice_ctr))
    result.append(('incorrect_covered_splice_ctr', incorrect_covered_splice_ctr))

    result.append(('pos_correct_ctr', pos_correct_ctr))
    result.append(('pos_incorrect_ctr', pos_incorrect_ctr))

    result.append(('unspliced_spliced_reads_ctr', unspliced_spliced_reads_ctr))
    result.append(('wrong_spliced_reads_ctr', wrong_spliced_reads_ctr))

    result.append(('wrong_unspliced_reads_ctr', wrong_unspliced_reads_ctr))
    result.append(('wrong_aligned_unspliced_reads_ctr', wrong_aligned_unspliced_reads_ctr))

    result.append(('total_ctr', total_ctr))

    result.append(('skipped_ctr', skipped_ctr))
    result.append(('min_coverage', min_coverage))

    return result
847
848
849
def print_result(result):
    """
    Print each (name, counter) pair of an evaluation result, one per line.

    BUGFIX/portability: the old two-argument `print name,ctr` statement is
    Python-2-only syntax; the formatted single-argument form below produces
    the identical 'name ctr' output under both Python 2 and 3.
    """
    for name, ctr in result:
        print('%s %s' % (name, ctr))
854
855
def load_chunks(current_dir):
    """
    Load and concatenate all pickled prediction chunks ('chunk*' files)
    found directly in `current_dir`.
    """
    chunks_fn = [fn for fn in os.listdir(current_dir) if fn.startswith('chunk')]

    allPredictions = []

    for c_fn in chunks_fn:
        full_fn = os.path.join(current_dir, c_fn)
        print(full_fn)
        # BUGFIX: open pickles in binary mode and close the handle again
        # (the file object used to leak until interpreter exit)
        fh = open(full_fn, 'rb')
        try:
            current_chunk = cPickle.load(fh)
        finally:
            fh.close()
        allPredictions.extend(current_chunk)

    return allPredictions
871
872
def predict_on_all_chunks(current_dir, training_keys_fn):
    """
    Load every prediction chunk of one run from `current_dir`, split the
    predictions into training and test sets using the pickled training
    keys in `training_keys_fn`, and evaluate the test set.

    Returns the result list produced by predict_on().
    """
    allPredictions = load_chunks(current_dir)

    # group all candidate predictions by read id
    allPredictionsDict = {}
    for elem in allPredictions:
        id = elem['id']
        allPredictionsDict.setdefault(id, []).append(elem)

    training_keys = cPickle.load(open(training_keys_fn))

    training_set = {}
    for key in training_keys:
        # some of the reads used for training may not be found by vmatch at
        # all, so their keys are simply absent here
        try:
            training_set[key] = allPredictionsDict[key]
            del allPredictionsDict[key]
        except KeyError:
            pass

    test_set = allPredictionsDict

    # this is the heuristic.parsed_spliced_reads.txt file from the vmatch remapping step
    coverage_fn = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/all_coverages'

    all_filtered_reads = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/allReads.full'

    coverage_labels_fn = 'COVERAGE_LABELS'

    result_test = predict_on(test_set, all_filtered_reads, 'all_prediction_labels.txt', True, coverage_fn, coverage_labels_fn)

    return result_test
926
927
if __name__ == '__main__':
    # No standalone behaviour; the functions here are meant to be driven
    # from other scripts (e.g. via forall_experiments / predict_on_all_chunks).
    pass