git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8634 e1793c9e...
[qpalma.git] / scripts / createAlignmentFileFromPrediction.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import cPickle
5 import sys
6 import pydb
7 import pdb
8 import os
9 import os.path
10 import math
11
12 def prediction_on(filename):
13
14 allPredictions = cPickle.load(open(filename))
15
16 allPositions = {}
17
18
19 for current_example_pred in allPredictions:
20 gt_example = current_example_pred[0]
21
22 exampleId = gt_example['id']
23
24 for elem_nr,current_pred in enumerate(current_example_pred[1:]):
25 exampleId = current_pred['id']
26 chr = current_pred['chr']
27 strand = current_pred['strand']
28 true_cut = current_pred['true_cut']
29 start_pos = current_pred['alternative_start_pos']
30 alignment = current_pred['alignment']
31
32 predExons = current_pred['predExons']
33 trueExons = current_pred['trueExons']
34 predPositions = [elem + current_pred['alternative_start_pos'] for elem in predExons]
35 truePositions = [elem + current_pred['start_pos'] for elem in trueExons.flatten().tolist()[0]]
36
37 if len(predExons) == 4:
38 p1 = predExons[0]
39 p2 = predExons[1]
40 p3 = predExons[2]
41 p4 = predExons[3]
42
43 #line = '%d\t%d\t%d\t%d\t%d\n' % (exampleId,p1,p2,p3,p4)
44 #print line
45 allPositions[exampleId] = (chr,strand,start_pos,true_cut,p1,p2,p3,p4,alignment)
46
47
48 return allPositions
49
50
51 def writePredictions(fname,allPositions):
52
53 out_fh = open(fname,'w+')
54
55 allEntries = {}
56
57 for line in open('/fml/ag-raetsch/share/projects/qpalma/solexa/allReads.pipeline'):
58 line = line.strip()
59 id,seq,q1,q2,q3 = line.split()
60 id = int(id)
61
62 allEntries[id] = (seq,q1,q2,q3)
63
64
65 for id,elems in allPositions.items():
66 id += 1000000000000
67 seq,q1,q2,q3 = allEntries[id]
68 chr,strand,start_pos,true_cut,p1,p2,p3,p4,alignment = elems
69
70 p1 += start_pos
71 p2 += start_pos
72 p3 += start_pos
73 p4 += start_pos
74
75 #pdb.set_trace()
76
77 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
78 tExonSizes,tStarts, tEnds) = alignment
79
80 new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
81 (id,chr,strand,seq,q1,start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
82 str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
83 str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
84 str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''))
85
86 out_fh.write(new_line)
87
88 out_fh.close()
89
90 def collect_prediction(current_dir):
91 """
92 Given the toplevel directory this function takes care that for each distinct
93 experiment the training and test predictions are evaluated.
94
95 """
96 train_suffix = '_allPredictions_TRAIN'
97 test_suffix = '_allPredictions_TEST'
98
99 run_name = 'run_+_quality_+_splicesignals_+_intron_len_1'
100
101 jp = os.path.join
102 b2s = ['-','+']
103
104 currentRun = cPickle.load(open(jp(current_dir,'run_object_1.pickle')))
105 QFlag = currentRun['enable_quality_scores']
106 SSFlag = currentRun['enable_splice_signals']
107 ILFlag = currentRun['enable_intron_length']
108 currentRunId = '%s%s%s' % (b2s[QFlag],b2s[SSFlag],b2s[ILFlag])
109
110 #filename = jp(current_dir,run_name)+train_suffix
111 #print 'Prediction on: %s' % filename
112 #prediction_on(filename)
113
114 filename = jp(current_dir,run_name)+test_suffix
115 print 'Prediction on: %s' % filename
116 test_result = prediction_on(filename)
117
118 fname = 'predictions.txt'
119 writePredictions(fname,test_result)
120
121 #return train_result,test_result,currentRunId
122
123
124 if __name__ == '__main__':
125 dir = sys.argv[1]
126 #run_name = sys.argv[2]
127 collect_prediction(dir)