+ added new format for QPalma alignment output
[qpalma.git] / scripts / createAlignmentFileFromPrediction.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import cPickle
5 import sys
6 import pydb
7 import pdb
8 import os
9 import os.path
10 import math
11
12 def prediction_on(filename):
13
14 allPredictions = cPickle.load(open(filename))
15
16 allPositions = {}
17
18
19 for current_example_pred in allPredictions:
20 gt_example = current_example_pred[0]
21
22 exampleId = gt_example['id']
23
24 for elem_nr,current_pred in enumerate(current_example_pred[1:]):
25 exampleId = current_pred['id']
26 predExons = current_pred['predExons']
27 trueExons = current_pred['trueExons']
28 predPositions = [elem + current_pred['alternative_start_pos'] for elem in predExons]
29 truePositions = [elem + current_pred['start_pos'] for elem in trueExons.flatten().tolist()[0]]
30
31 if len(predExons) == 4:
32 p1 = predExons[0]
33 p2 = predExons[1]
34 p3 = predExons[2]
35 p4 = predExons[3]
36
37 #line = '%d\t%d\t%d\t%d\t%d\n' % (exampleId,p1,p2,p3,p4)
38 #print line
39 allPositions[exampleId] = (chr,strand,true_cut,p1,p2,p3,p4)
40
41
42 return allPositions
43
44
45 def writePredictions(fname,allPositions)
46
47 out_fh = open(fname,'w+')
48
49 allEntries = {}
50
51 for line in open('/fml/ag-raetsch/share/projects/qpalma/solexa/allReads.pipeline'):
52 line = line.strip()
53 id,seq,q1,q2,q3 = line.split()
54 id = int(id)
55
56 allEntries[id] = (seq,q1,q2,q3)
57
58
59 for id,elems in allPositions.items():
60 seq,q1,q2,q3 = allEntries[id]
61 chr,strand,true_cut,p1,p2,p3,p4 = elems
62
63 new_line = '%d\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n' %\
64 (id,chr,strand,seq,q1,q2,q3,true_cut,p1,p2,p3,p4)
65
66 out_fh.write(line)
67
68
69 def collect_prediction(current_dir,run_name):
70 """
71 Given the toplevel directory this function takes care that for each distinct
72 experiment the training and test predictions are evaluated.
73
74 """
75 train_suffix = '_allPredictions_TRAIN'
76 test_suffix = '_allPredictions_TEST'
77
78 jp = os.path.join
79 b2s = ['-','+']
80
81 currentRun = cPickle.load(open(jp(current_dir,'run_object.pickle')))
82 QFlag = currentRun['enable_quality_scores']
83 SSFlag = currentRun['enable_splice_signals']
84 ILFlag = currentRun['enable_intron_length']
85 currentRunId = '%s%s%s' % (b2s[QFlag],b2s[SSFlag],b2s[ILFlag])
86
87 filename = jp(current_dir,run_name)+train_suffix
88 print 'Prediction on: %s' % filename
89 prediction_on(filename)
90
91 #filename = jp(current_dir,run_name)+test_suffix
92 #print 'Prediction on: %s' % filename
93 #test_result = prediction_on(filename)
94
95 #return train_result,test_result,currentRunId
96
97
98 if __name__ == '__main__':
99 dir = sys.argv[1]
100 run_name = sys.argv[2]
101 collect_prediction(dir,run_name)