[qpalma.git] / scripts / createAlignmentFileFromPrediction.py (commit: "+ extended alignment script")
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import cPickle
import sys
import os
import os.path

def prediction_on(filename):
    """
    Load a pickled list of predictions and collect, per example id, the
    chromosome, strand, start position, true cut position and the four
    predicted exon boundary positions. Only predictions with exactly four
    exon boundaries are kept.
    """
    allPredictions = cPickle.load(open(filename))

    allPositions = {}

    for current_example_pred in allPredictions:
        # the first element holds the ground-truth example, the rest are predictions
        gt_example = current_example_pred[0]
        exampleId = gt_example['id']

        for elem_nr,current_pred in enumerate(current_example_pred[1:]):
            exampleId = current_pred['id']
            chr = current_pred['chr']
            strand = current_pred['strand']
            true_cut = current_pred['true_cut']
            start_pos = current_pred['alternative_start_pos']

            predExons = current_pred['predExons']
            trueExons = current_pred['trueExons']
            # exon boundaries are stored relative to their respective start positions
            predPositions = [elem + current_pred['alternative_start_pos'] for elem in predExons]
            truePositions = [elem + current_pred['start_pos'] for elem in trueExons.flatten().tolist()[0]]

            if len(predExons) == 4:
                p1 = predExons[0]
                p2 = predExons[1]
                p3 = predExons[2]
                p4 = predExons[3]

                #line = '%d\t%d\t%d\t%d\t%d\n' % (exampleId,p1,p2,p3,p4)
                #print line
                allPositions[exampleId] = (chr,strand,start_pos,true_cut,p1,p2,p3,p4)

    return allPositions

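# A minimal sketch of the per-prediction dictionary layout this script assumes;
# the field names are taken from the accesses in prediction_on above, while the
# concrete values are hypothetical and depend on the QPalma run that produced
# the pickle:
#
#   current_pred = {
#       'id': 17,                          # example/read id
#       'chr': 1,
#       'strand': '+',
#       'true_cut': 36,
#       'start_pos': 1234500,              # offset for the true exon coordinates
#       'alternative_start_pos': 1234480,  # offset for the predicted exon coordinates
#       'predExons': [0, 36, 120, 156],    # boundaries relative to alternative_start_pos
#       'trueExons': ...,                  # 2d matrix of true exon boundaries
#   }
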
def writePredictions(fname,allPositions):
    """
    Write one tab-separated line per predicted example, combining the read
    sequence and quality strings from the pipeline file with the predicted
    exon boundaries shifted to absolute coordinates.
    """
    out_fh = open(fname,'w+')

    # load the original reads: id, sequence and three quality strings
    allEntries = {}

    for line in open('/fml/ag-raetsch/share/projects/qpalma/solexa/allReads.pipeline'):
        line = line.strip()
        id,seq,q1,q2,q3 = line.split()
        id = int(id)

        allEntries[id] = (seq,q1,q2,q3)

    for id,elems in allPositions.items():
        # read ids in the pipeline file are offset by 10^12 w.r.t. the prediction ids
        seq,q1,q2,q3 = allEntries[1000000000000+id]
        chr,strand,start_pos,true_cut,p1,p2,p3,p4 = elems

        # shift exon boundaries from relative to absolute positions
        p1 += start_pos
        p2 += start_pos
        p3 += start_pos
        p4 += start_pos

        new_line = '%d\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n' %\
            (id,chr,strand,seq,q1,q2,q3,true_cut,p1,p2,p3,p4)

        out_fh.write(new_line)

    out_fh.close()

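# For reference, each line written by writePredictions is tab-separated with
# the columns
#   id  chr  strand  seq  q1  q2  q3  true_cut  p1  p2  p3  p4
# where p1..p4 are the predicted exon boundaries in absolute coordinates.
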
def collect_prediction(current_dir):
    """
    Given the top-level result directory, evaluate the training and test
    predictions for each distinct experiment.
    """
    train_suffix = '_allPredictions_TRAIN'
    test_suffix = '_allPredictions_TEST'

    run_name = 'run_+_quality_+_splicesignals_+_intron_len_1'

    jp = os.path.join
    # map the boolean feature flags to '-'/'+' for a short run id
    b2s = ['-','+']

    currentRun = cPickle.load(open(jp(current_dir,'run_object_1.pickle')))
    QFlag = currentRun['enable_quality_scores']
    SSFlag = currentRun['enable_splice_signals']
    ILFlag = currentRun['enable_intron_length']
    currentRunId = '%s%s%s' % (b2s[QFlag],b2s[SSFlag],b2s[ILFlag])

    #filename = jp(current_dir,run_name)+train_suffix
    #print 'Prediction on: %s' % filename
    #prediction_on(filename)

    filename = jp(current_dir,run_name)+test_suffix
    print 'Prediction on: %s' % filename
    test_result = prediction_on(filename)

    fname = 'predictions.txt'
    writePredictions(fname,test_result)

    #return train_result,test_result,currentRunId

if __name__ == '__main__':
    dir = sys.argv[1]
    #run_name = sys.argv[2]
    collect_prediction(dir)
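
# Example invocation (the result directory path is hypothetical):
#
#   python createAlignmentFileFromPrediction.py /path/to/experiment_dir
#
# The directory is expected to contain run_object_1.pickle and the
# '<run_name>_allPredictions_TEST' pickle; the alignment table is written to
# predictions.txt in the current working directory.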