952eb4c34df26bf4e67d81fa5c98b6c0963df09d
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 import cPickle
5 import sys
6 import pdb
7 import os
8 import os.path
9 import math
11 from qpalma.parsers import *
data = None  # module-level placeholder; unused in the visible portion of this file
def result_statistic():
    """
    Aggregate statistics over the evaluation results (not implemented yet).
    """
    return None
def createErrorVSCutPlot(results):
    """
    Write a LaTeX table ('error_rates_table.tex') with one counter line per
    read position.

    `results` maps a three-character flag key (quality/splice-site/intron
    information, e.g. '+++') to a result tuple whose second element is
    indexable by position (0..36); missing or short entries count as zero.
    """
    lines = ['\\begin{tabular}{|c|c|c|r|}', '\hline',
             'Quality & Splice & Intron & \multicolumn{1}{c|}{Error on Positions} & \multicolumn{1}{c|}{Error on Scores} & \\',
             'information & site pred. & length & \multicolumn{1}{c|}{rate}\\', '\hline']

    #for pos,key in enumerate(['---','+--','-+-','++-','--+','+-+','-++','+++']):
    for pos, key in enumerate(['+++']):
        res = results[key]
        for i in range(37):
            # a missing counter (short sequence / None) is treated as zero;
            # narrowed from the previous bare `except`
            try:
                ctr = res[1][i]
            except (LookupError, TypeError):
                ctr = 0
            lines.append('%d\n' % ctr)

        if pos % 2 == 1:
            lines.append('\hline')

    lines.append('\end{tabular}')

    # context manager guarantees the handle is closed (was leaked before)
    with open('error_rates_table.tex', 'w+') as fh:
        fh.writelines(l + '\n' for l in lines)
def createTable(results):
    """
    Write a LaTeX table ('result_table.tex') summarising the evaluation.

    `results` maps a three-character flag key (quality/splice-site/intron
    information, e.g. '+-+') to a sequence of error rates given as fractions;
    they are rendered as percentages.  The first block of rows shows the
    position and score errors (res[0], res[1]), the second block the third
    error value (res[2]).
    """
    # single source of truth for the row order (was duplicated before)
    all_keys = ['---', '+--', '-+-', '++-', '--+', '+-+', '-++', '+++']

    lines = ['\\begin{tabular}{|c|c|c|r|}', '\hline',
             'Quality & Splice & Intron & \multicolumn{1}{c|}{Error on Positions} & \multicolumn{1}{c|}{Error on Scores} & \\',
             'information & site pred. & length & \multicolumn{1}{c|}{rate}\\', '\hline']

    for pos, key in enumerate(all_keys):
        res = [e * 100 for e in results[key]]
        lines.append('%s & %s & %s & %2.2f & %2.2f \\%%\\\\' % (key[0], key[1], key[2], res[0], res[1]))
        if pos % 2 == 1:
            lines.append('\hline')

    for pos, key in enumerate(all_keys):
        res = [e * 100 for e in results[key]]
        lines.append('%s & %s & %s & %2.2f & x \\%%\\\\' % (key[0], key[1], key[2], res[2]))
        if pos % 2 == 1:
            lines.append('\hline')

    lines.append('\end{tabular}')

    # context manager guarantees the handle is closed (was leaked before)
    with open('result_table.tex', 'w+') as fh:
        fh.writelines(l + '\n' for l in lines)
def compare_scores_and_labels(scores, labels):
    """
    Check whether every correctly labeled prediction out-scores all
    incorrectly labeled ones.

    Returns False as soon as some incorrect prediction scores greater than
    or equal to some correct prediction; True otherwise (including the cases
    where there are no correct or no incorrect predictions at all).

    Runs in O(n) instead of the previous O(n^2) all-pairs scan: the pairwise
    condition "exists incorrect o and correct c with o >= c" is equivalent to
    "max(incorrect) >= min(correct)".
    """
    correct_scores = [s for s, l in zip(scores, labels) if l == True]
    incorrect_scores = [s for s, l in zip(scores, labels) if l == False]

    # no correct (or no incorrect) predictions: nothing can violate the order
    if not correct_scores or not incorrect_scores:
        return True

    return max(incorrect_scores) < min(correct_scores)
def compare_exons(predExons, trueExons):
    """
    Return True iff the four predicted exon boundaries exactly match the
    2x2 true-exon matrix; a prediction without exactly four boundaries, or
    any non-zero offset, yields False.
    """
    if len(predExons) != 4:
        return False

    # absolute offsets of each predicted boundary from its true counterpart
    offsets = [
        int(math.fabs(predExons[0] - trueExons[0, 0])),
        int(math.fabs(predExons[1] - trueExons[0, 1])),
        int(math.fabs(predExons[2] - trueExons[1, 0])),
        int(math.fabs(predExons[3] - trueExons[1, 1])),
    ]

    return offsets == [0, 0, 0, 0]
def evaluate_unmapped_example(current_prediction):
    """Compare a prediction's exon boundaries against its ground truth."""
    return compare_exons(current_prediction['predExons'],
                         current_prediction['trueExons'])
def evaluate_example(current_prediction):
    """
    Evaluate a single read prediction.

    Returns a (label, position_comparison, dp_score) triple.  For reads that
    vmatch mapped to an incorrect position (label False) only the score is
    meaningful and the position comparison is always False.
    """
    label = current_prediction['label']
    pred_score = current_prediction['DPScores'].flatten().tolist()[0][0]

    # if the read was mapped by vmatch at an incorrect position we only have
    # to compare the score
    if label == False:
        return label, False, pred_score

    alt_offset = current_prediction['alternative_start_pos']
    predPositions = [e + alt_offset for e in current_prediction['predExons']]

    start = current_prediction['start_pos']
    truePositions = [e + start for e in current_prediction['trueExons'].flatten().tolist()[0]]

    return label, predPositions == truePositions, pred_score
def prediction_on(filename):
    """
    Evaluate the predictions stored for one run file and print/return the
    error statistics.

    NOTE(review): `allPredictions` is used below but never assigned in the
    visible code -- presumably unpickled from `filename` in lines missing
    from this excerpt; confirm against the full file.
    """
    gt_correct_ctr = 0
    gt_incorrect_ctr = 0
    incorrect_gt_cuts = {}

    pos_correct_ctr = 0
    pos_incorrect_ctr = 0
    incorrect_vmatch_cuts = {}

    score_correct_ctr = 0
    score_incorrect_ctr = 0

    total_gt_examples = 0
    total_vmatch_instances_ctr = 0

    true_vmatch_instances_ctr = 0

    # keep only the highest-scoring prediction per example position
    allUniquePredictions = [False]*len(allPredictions)

    for pos,current_example_pred in enumerate(allPredictions):
        for elem_nr,new_prediction in enumerate(current_example_pred[1:]):
            if allUniquePredictions[pos] != False:
                current_prediction = allUniquePredictions[pos]

                current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
                new_score = new_prediction['DPScores'].flatten().tolist()[0][0]

                if current_a_score < new_score :
                    # NOTE(review): `id` here is the Python builtin, so this
                    # indexes the list with a function object -- almost
                    # certainly meant `allUniquePredictions[pos]` (compare the
                    # analogous dict-based loop later in this file).
                    allUniquePredictions[id] = new_prediction
            else:
                allUniquePredictions[pos] = new_prediction

    for current_pred in allUniquePredictions:
        if current_pred == False:
            continue

        #for current_example_pred in allPredictions:
        #gt_example = current_example_pred[0]
        #gt_score = gt_example['DPScores'].flatten().tolist()[0][0]
        #gt_correct = evaluate_unmapped_example(gt_example)
        #exampleIdx = gt_example['exampleIdx']
        #cut_pos = gt_example['true_cut']
        #if gt_correct:
        #   gt_correct_ctr += 1
        #else:
        #   gt_incorrect_ctr += 1
        #   try:
        #      incorrect_gt_cuts[cut_pos] += 1
        #   except:
        #      incorrect_gt_cuts[cut_pos] = 1
        #total_gt_examples += 1
        #current_scores = []
        #current_labels = []
        #for elem_nr,current_pred in enumerate(current_example_pred[1:]):

        current_label,comparison_result,current_score = evaluate_example(current_pred)

        # if vmatch found the right read pos we check for right exons
        # boundaries
        #if current_label:
        if comparison_result:
            pos_correct_ctr += 1
        else:
            pos_incorrect_ctr += 1
            #try:
            #   incorrect_vmatch_cuts[cut_pos] += 1
            #except:
            #   incorrect_vmatch_cuts[cut_pos] = 1

        true_vmatch_instances_ctr += 1

        #current_scores.append(current_score)
        #current_labels.append(current_label)

        total_vmatch_instances_ctr += 1

        # check whether the correct predictions score higher than the incorrect
        # ones
        #cmp_res = compare_scores_and_labels(current_scores,current_labels)
        #if cmp_res:
        #   score_correct_ctr += 1
        #else:
        #   score_incorrect_ctr += 1

    # now that we have evaluated all instances put out all counters and sizes
    print 'Total num. of examples: %d' % len(allPredictions)
    print 'Number of correct ground truth examples: %d' % gt_correct_ctr
    print 'Total num. of true vmatch instances %d' % true_vmatch_instances_ctr
    print 'Correct pos: %d, incorrect pos: %d' % (pos_correct_ctr,pos_incorrect_ctr)
    print 'Total num. of vmatch instances %d' % total_vmatch_instances_ctr
    print 'Correct scores: %d, incorrect scores: %d' %\
    (score_correct_ctr,score_incorrect_ctr)

    pos_error = 1.0 * pos_incorrect_ctr / total_vmatch_instances_ctr
    score_error = 1.0 * score_incorrect_ctr / total_vmatch_instances_ctr
    # NOTE(review): total_gt_examples is never incremented in the visible code
    # (the incrementing block above is commented out), so this division raises
    # ZeroDivisionError -- confirm intent against the full file.
    gt_error = 1.0 * gt_incorrect_ctr / total_gt_examples

    return (pos_error,score_error,gt_error,incorrect_gt_cuts,incorrect_vmatch_cuts)
def collect_prediction(current_dir,run_name):
    """
    Given the toplevel directory this function takes care that for each distinct
    experiment the training and test predictions are evaluated.

    Returns a (train_result, test_result, currentRunId) triple; the training
    evaluation is currently disabled and returned as an empty list.
    """
    # fixed experiment index used to build the prediction file names
    idx = 5

    train_suffix = '_%d_allPredictions_TRAIN' % (idx)
    test_suffix = '_%d_allPredictions_TEST' % (idx)

    jp = os.path.join
    # maps a boolean flag to the '-'/'+' character used in the run id
    b2s = ['-','+']

    # NOTE(review): `currentRun` is not defined in the visible code --
    # presumably unpickled from a run description file in lines missing from
    # this excerpt; confirm against the full file.
    QFlag = currentRun['enable_quality_scores']
    SSFlag = currentRun['enable_splice_signals']
    ILFlag = currentRun['enable_intron_length']
    currentRunId = '%s%s%s' % (b2s[QFlag],b2s[SSFlag],b2s[ILFlag])

    #filename = jp(current_dir,run_name)+train_suffix
    #print 'Prediction on: %s' % filename
    #train_result = prediction_on(filename)
    train_result = []

    filename = jp(current_dir,run_name)+test_suffix
    print 'Prediction on: %s' % filename
    test_result = prediction_on(filename)

    return train_result,test_result,currentRunId
def perform_prediction(current_dir,run_name):
    """
    This function takes care of starting the jobs needed for the prediction phase
    of qpalma

    One qsub cluster job is submitted per split; currently only split 1 is run.
    """
    #for i in range(1,6):
    for i in range(1,2):
        # the backslash continuation below is INSIDE the string literal, so the
        # command line continues directly with 'qsub ...'
        cmd = 'echo /fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/doPrediction.sh %s %d |\
qsub -l h_vmem=12.0G -cwd -j y -N \"%s_%d.log\"'%(current_dir,i,run_name,i)

        #cmd = './doPrediction.sh %s 1>%s.out 2>%s.err' %(current_dir,run_name,run_name)
        #print cmd
        os.system(cmd)
def forall_experiments(current_func,tl_dir):
    """
    Given the toplevel directoy this function calls for each subdir the
    function given as first argument. Which are at the moment:

    - perform_prediction, and
    - collect_prediction.

    For collect_prediction the per-run results are accumulated and written
    out as a LaTeX table at the end.
    """
    dir_entries = os.listdir(tl_dir)
    dir_entries = [os.path.join(tl_dir,de) for de in dir_entries]
    run_dirs = [de for de in dir_entries if os.path.isdir(de)]

    all_results = {}
    all_error_rates = {}

    for current_dir in run_dirs:
        run_name = current_dir.split('/')[-1]

        # (fix: removed a leftover pdb.set_trace() that dropped every run
        # into the interactive debugger)

        if current_func.__name__ == 'perform_prediction':
            current_func(current_dir,run_name)

        if current_func.__name__ == 'collect_prediction':
            train_result,test_result,currentRunId = current_func(current_dir,run_name)
            all_results[currentRunId] = test_result
            pos_error,score_error,gt_error,incorrect_gt_cuts,incorrect_vmatch_cuts = test_result
            all_error_rates[currentRunId] = (incorrect_gt_cuts,incorrect_vmatch_cuts)

    if current_func.__name__ == 'collect_prediction':
        #createErrorVSCutPlot(all_error_rates)
        createTable(all_results)
360 """
361 This function evaluates the predictions made by QPalma.
362 It needs a pickled file containing the predictions themselves and the
363 ascii file with original reads.
365 Optionally one can specifiy a coverage file containing for each read the
366 coverage number estimated by a remapping step.
368 """
370 coverage_map = {}
372 if with_coverage:
373 for line in open('/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/coverage_results/ALL_COVERAGES'):
374 id,coverage_nr = line.strip().split()
375 coverage_map[int(id)] = int(coverage_nr)
381 out_fh = open('predicted_positions.txt','w+')
385 spliced_ctr = 0
386 unspliced_ctr = 0
388 pos_correct_ctr = 0
389 pos_incorrect_ctr = 0
391 correct_spliced_ctr = 0
392 correct_unspliced_ctr = 0
394 incorrect_spliced_ctr = 0
395 incorrect_unspliced_ctr = 0
397 correct_covered_splice_ctr = 0
398 incorrect_covered_splice_ctr = 0
400 total_vmatch_instances_ctr = 0
408 cut_pos_ctr = {}
410 total_ctr = 0
411 skipped_ctr = 0
413 is_spliced = False
414 min_coverage = 3
416 allUniqPredictions = {}
418 print 'Got %d predictions' % len(allPredictions)
420 for new_prediction in allPredictions:
421 id = new_prediction['id']
422 id = int(id)
424 if allUniqPredictions.has_key(id):
425 current_prediction = allUniqPredictions[id]
427 current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
428 new_score = new_prediction['DPScores'].flatten().tolist()[0][0]
430 if current_a_score < new_score :
431 allUniqPredictions[id] = new_prediction
433 else:
434 allUniqPredictions[id] = new_prediction
436 print 'Got %d uniq predictions' % len(allUniqPredictions)
438 #for current_prediction in allPredictions:
439 for _id,current_prediction in allUniqPredictions.items():
440 id = current_prediction['id']
441 id = int(id)
443 if not id >= 1000000300000:
444 is_spliced = True
445 else:
446 is_spliced = False
448 is_covered = False
450 if is_spliced and with_coverage:
451 try:
452 current_coverage_nr = coverage_map[id]
453 is_covered = True
454 except:
455 is_covered = False
458 if is_spliced:
459 spliced_ctr += 1
460 else:
461 unspliced_ctr += 1
463 try:
465 except:
466 skipped_ctr += 1
467 continue
469 start_pos = current_prediction['start_pos']
470 chr = current_prediction['chr']
471 strand = current_prediction['strand']
473 #score = current_prediction['DPScores'].flatten().tolist()[0][0]
474 #pdb.set_trace()
476 predExons = current_prediction['predExons'] #:newExons, 'dna':dna, 'est':est
477 predExons = [e+start_pos for e in predExons]
479 spliced_flag = False
481 if len(predExons) == 4:
482 spliced_flag = True
483 predExons[1] -= 1
484 predExons[3] -= 1
486 if predExons[0] == 19504568:
487 pdb.set_trace()
489 cut_pos = current_ground_truth['true_cut']
490 p_start = current_ground_truth['p_start']
491 e_stop = current_ground_truth['exon_stop']
492 e_start = current_ground_truth['exon_start']
493 p_stop = current_ground_truth['p_stop']
495 true_cut = current_ground_truth['true_cut']
497 if p_start == predExons[0] and e_stop == predExons[1] and\
498 e_start == predExons[2] and p_stop == predExons[3]:
499 pos_correct = True
500 else:
501 pos_correct = False
503 elif len(predExons) == 2:
504 spliced_flag = False
505 predExons[1] -= 1
507 cut_pos = current_ground_truth['true_cut']
508 p_start = current_ground_truth['p_start']
509 p_stop = current_ground_truth['p_stop']
511 true_cut = current_ground_truth['true_cut']
513 if math.fabs(p_start - predExons[0]) <= 0:# and math.fabs(p_stop - predExons[1]) <= 2:
514 pos_correct = True
515 else:
516 pos_correct = False
518 else:
519 pos_correct = False
521 if is_spliced and not spliced_flag:
524 if is_spliced and not pos_correct and len(predExons) == 4 and predExons[1]!=-1:
527 if not is_spliced and spliced_flag:
530 if not is_spliced and not pos_correct:
533 if pos_correct:
534 pos_correct_ctr += 1
536 if is_spliced:
537 correct_spliced_ctr += 1
538 if with_coverage and is_covered and current_coverage_nr >= min_coverage:
539 correct_covered_splice_ctr += 1
541 if not is_spliced:
542 correct_unspliced_ctr += 1
544 else:
545 pos_incorrect_ctr += 1
547 if is_spliced:
548 incorrect_spliced_ctr += 1
549 if with_coverage and is_covered and current_coverage_nr >= min_coverage:
550 incorrect_covered_splice_ctr += 1
552 if not is_spliced:
553 incorrect_unspliced_ctr += 1
555 if with_coverage and spliced_flag:
556 if not is_covered:
557 current_coverage_nr=0
558 if pos_correct:
559 print "%s\tcorrect\t%i" %( current_prediction['id'], current_coverage_nr)
560 else:
561 print "%s\twrong\t%i" %( current_prediction['id'], current_coverage_nr)
563 total_ctr += 1
566 numPredictions = len(allUniqPredictions)
568 # now that we have evaluated all instances put out all counters and sizes
569 print 'Total num. of examples: %d' % numPredictions
571 print "spliced/unspliced: %d,%d " % (spliced_ctr, unspliced_ctr )
572 print "Correct/incorrect spliced: %d,%d " % (correct_spliced_ctr, incorrect_spliced_ctr )
573 print "Correct/incorrect unspliced: %d,%d " % (correct_unspliced_ctr , incorrect_unspliced_ctr )
574 print "Correct/incorrect covered spliced read: %d,%d " %\
575 (correct_covered_splice_ctr,incorrect_covered_splice_ctr)
577 print "pos_correct: %d,%d" % (pos_correct_ctr , pos_incorrect_ctr )
583 print 'wrong aligned at wrong_pos: %d' % wrong_aligned_unspliced_reads_ctr
585 print 'total_ctr: %d' % total_ctr
587 print "skipped: %d " % skipped_ctr
588 print 'min. coverage: %d' % min_coverage
590 result_dict = {}
591 result_dict['skipped_ctr'] = skipped_ctr
592 result_dict['min_coverage'] = min_coverage
594 return result_dict
601 """
602 This function evaluates the predictions made by QPalma.
603 It needs a pickled file containing the predictions themselves and the
604 ascii file with original reads.
606 Optionally one can specifiy a coverage file containing for each read the
607 coverage number estimated by a remapping step.
610 """
612 coverage_labels_fh = open(coverage_labels_fn,'w+')
614 all_labels_fh = open(all_labels_fn,'w+')
616 import qparser
619 coverage_map = {}
622 if with_coverage:
623 for line in open(coverage_fn):
624 id,coverage_nr = line.strip().split()
625 coverage_map[int(id)] = int(coverage_nr)
627 #out_fh = open('predicted_positions.txt','w+')
629 spliced_ctr = 0
630 unspliced_ctr = 0
632 pos_correct_ctr = 0
633 pos_incorrect_ctr = 0
635 correct_spliced_ctr = 0
636 correct_unspliced_ctr = 0
638 incorrect_spliced_ctr = 0
639 incorrect_unspliced_ctr = 0
641 correct_covered_splice_ctr = 0
642 incorrect_covered_splice_ctr = 0
644 total_vmatch_instances_ctr = 0
652 cut_pos_ctr = {}
654 total_ctr = 0
655 skipped_ctr = 0
657 is_spliced = False
658 min_coverage = 3
660 allUniqPredictions = {}
662 print 'Got %d predictions' % len(allPredictions)
664 for k,predictions in allPredictions.items():
665 for new_prediction in predictions:
666 id = new_prediction['id']
667 id = int(id)
669 if allUniqPredictions.has_key(id):
670 current_prediction = allUniqPredictions[id]
672 current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
673 new_score = new_prediction['DPScores'].flatten().tolist()[0][0]
675 if current_a_score < new_score :
676 allUniqPredictions[id] = new_prediction
678 else:
679 allUniqPredictions[id] = new_prediction
681 print 'Got %d uniq predictions' % len(allUniqPredictions)
683 for _id,current_prediction in allUniqPredictions.items():
684 id = current_prediction['id']
685 id = int(id)
687 if not id >= 1000000300000:
688 is_spliced = True
689 else:
690 is_spliced = False
692 is_covered = False
694 if is_spliced and with_coverage:
695 try:
696 current_coverage_nr = coverage_map[id]
697 is_covered = True
698 except:
699 is_covered = False
702 if is_spliced:
703 spliced_ctr += 1
704 else:
705 unspliced_ctr += 1
707 try:
710 except:
711 skipped_ctr += 1
712 continue
714 start_pos = current_prediction['start_pos']
715 chr = current_prediction['chr']
716 strand = current_prediction['strand']
718 #score = current_prediction['DPScores'].flatten().tolist()[0][0]
719 #pdb.set_trace()
721 predExons = current_prediction['predExons'] #:newExons, 'dna':dna, 'est':est
722 predExons = [e+start_pos for e in predExons]
724 spliced_flag = False
726 if len(predExons) == 4:
727 spliced_flag = True
728 predExons[1] -= 1
729 predExons[3] -= 1
731 cut_pos = current_ground_truth['true_cut']
732 p_start = current_ground_truth['p_start']
733 e_stop = current_ground_truth['exon_stop']
734 e_start = current_ground_truth['exon_start']
735 p_stop = current_ground_truth['p_stop']
737 true_cut = current_ground_truth['true_cut']
739 if p_start == predExons[0] and e_stop == predExons[1] and\
740 e_start == predExons[2] and p_stop == predExons[3]:
741 pos_correct = True
742 else:
743 pos_correct = False
745 elif len(predExons) == 2:
746 spliced_flag = False
747 predExons[1] -= 1
749 cut_pos = current_ground_truth['true_cut']
750 p_start = current_ground_truth['p_start']
751 p_stop = current_ground_truth['p_stop']
753 true_cut = current_ground_truth['true_cut']
755 if math.fabs(p_start - predExons[0]) <= 0:# and math.fabs(p_stop - predExons[1]) <= 2:
756 pos_correct = True
757 else:
758 pos_correct = False
760 else:
761 pos_correct = False
763 if is_spliced and not spliced_flag:
766 if is_spliced and not pos_correct and len(predExons) == 4 and predExons[1]!=-1:
769 if not is_spliced and spliced_flag:
772 if not is_spliced and not pos_correct:
775 if pos_correct:
776 pos_correct_ctr += 1
778 if is_spliced:
779 correct_spliced_ctr += 1
780 all_labels_fh.write('%d correct\n'%id)
781 if with_coverage and is_covered and current_coverage_nr >= min_coverage:
782 correct_covered_splice_ctr += 1
784 if not is_spliced:
785 correct_unspliced_ctr += 1
787 else:
788 pos_incorrect_ctr += 1
790 if is_spliced:
791 incorrect_spliced_ctr += 1
792 all_labels_fh.write('%d wrong\n'%id)
793 if with_coverage and is_covered and current_coverage_nr >= min_coverage:
794 incorrect_covered_splice_ctr += 1
796 if not is_spliced:
797 incorrect_unspliced_ctr += 1
799 if with_coverage:
800 if not is_covered:
801 current_coverage_nr=0
803 if pos_correct:
804 new_line = "%s\tcorrect\t%i" %( current_prediction['id'], current_coverage_nr)
805 else:
806 new_line = "%s\twrong\t%i" %( current_prediction['id'], current_coverage_nr)
808 coverage_labels_fh.write(new_line+'\n')
810 total_ctr += 1
812 coverage_labels_fh.close()
814 numPredictions = len(allUniqPredictions)
816 result = []
818 # now that we have evaluated all instances put out all counters and sizes
819 result.append(('numPredictions',numPredictions))
820 result.append(('spliced_ctr',spliced_ctr))
821 result.append(('unspliced_ctr',unspliced_ctr))
823 result.append(('correct_spliced_ctr',correct_spliced_ctr))
824 result.append(('incorrect_spliced_ctr',incorrect_spliced_ctr))
826 result.append(('correct_unspliced_ctr',correct_unspliced_ctr))
827 result.append(('incorrect_unspliced_ctr',incorrect_unspliced_ctr))
829 result.append(('correct_covered_splice_ctr',correct_covered_splice_ctr))
830 result.append(('incorrect_covered_splice_ctr',incorrect_covered_splice_ctr))
832 result.append(('pos_correct_ctr',pos_correct_ctr))
833 result.append(('pos_incorrect_ctr',pos_incorrect_ctr))
841 result.append(('total_ctr',total_ctr))
843 result.append(('skipped_ctr',skipped_ctr))
844 result.append(('min_coverage',min_coverage))
846 return result
850 def print_result(result):
851 # now that we have evaluated all instances put out all counters and sizes
852 for name,ctr in result:
853 print name,ctr
# NOTE(review): this excerpt starts inside a function whose `def` line is not
# visible here; it collects the predictions of all chunk files found in
# `current_dir`.
chunks_fn = []
for fn in os.listdir(current_dir):
    if fn.startswith('chunk'):
        chunks_fn.append(fn)

allPredictions = []

for c_fn in chunks_fn:
    full_fn = os.path.join(current_dir,c_fn)
    print full_fn
    # NOTE(review): the line unpickling `current_chunk` from full_fn is
    # missing from this excerpt.
    allPredictions.extend(current_chunk)

return allPredictions
def predict_on_all_chunks(current_dir,training_keys_fn):
    """
    We load all chunks from the current_dir belonging to one run.
    Then we load the saved keys of the training set to restore the training and
    testing sets.
    Once we have done that we separately evaluate both sets.
    """
    # NOTE(review): the lines binding `allPredictions` (chunk loading) and
    # `training_keys` (unpickling training_keys_fn) are missing from this
    # excerpt, and the function appears to continue past its end -- confirm
    # against the full file.

    # group all predictions by read id
    allPredictionsDict = {}
    for elem in allPredictions:
        id = elem['id']

        if allPredictionsDict.has_key(id):
            old_entry = allPredictionsDict[id]
            old_entry.append(elem)
            allPredictionsDict[id] = old_entry
        else:
            allPredictionsDict[id] = [elem]

    training_set = {}
    for key in training_keys:
        # we have the try construct because some of the reads used for training
        # may not be found using vmatch at all
        try:
            training_set[key] = allPredictionsDict[key]
            del allPredictionsDict[key]
        except:
            pass

    # everything not claimed by the training set is treated as the test set
    test_set = allPredictionsDict

    #test_set = {}
    #for k in allPredictionsDict.keys()[:100]:
    #   test_set[k] = allPredictionsDict[k]