+ added faster evaluation function
[qpalma.git] / scripts / createAlignmentFileFromPrediction.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import cPickle
5 import sys
6 import pdb
7 import os
8 import os.path
9 import math
10 from qpalma.parsers import *
11
12 def prediction_on(filename,filtered_reads,out_fname):
13
14 print 'parsing filtered reads..'
15 all_filtered_reads = parse_filtered_reads(filtered_reads)
16 print 'found %d filtered reads' % len(all_filtered_reads)
17
18 out_fh = open(out_fname,'w+')
19
20 allPredictions = cPickle.load(open(filename))
21 allPositions = {}
22
23 for current_prediction in allPredictions:
24 id = current_prediction['id']
25 current_ground_truth = all_filtered_reads[id]
26
27 true_cut = current_ground_truth['true_cut']
28 seq = current_ground_truth['seq']
29 q1 = current_ground_truth['prb']
30
31 chr = current_prediction['chr']
32 strand = current_prediction['strand']
33 #true_cut = current_prediction['true_cut']
34 start_pos = current_prediction['start_pos']
35 alignment = current_prediction['alignment']
36
37 predExons = current_prediction['predExons']
38
39 if len(predExons) == 4:
40 p1 = predExons[0]
41 p2 = predExons[1]
42 p3 = predExons[2]
43 p4 = predExons[3]
44
45 elif len(predExons) == 2:
46 p1 = predExons[0]
47 p2 = -1
48 p3 = -1
49 p4 = predExons[1]
50
51 #allPositions[exampleId] = (chr,strand,start_pos,true_cut,p1,p2,p3,p4,alignment)
52
53 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
54 tExonSizes,tStarts, tEnds) = alignment
55
56 new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
57 (id,chr,strand,seq,q1,start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
58 str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
59 str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
60 str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''))
61
62 out_fh.write(new_line)
63
64 return allPositions
65
66
67 def writePredictions(fname,allPositions):
68
69
70 allEntries = {}
71
72 for line in open('/fml/ag-raetsch/share/projects/qpalma/solexa/allReads.pipeline'):
73 line = line.strip()
74 id,seq,q1,q2,q3 = line.split()
75 id = int(id)
76
77 allEntries[id] = (seq,q1,q2,q3)
78
79
80 for id,elems in allPositions.items():
81 seq,q1,q2,q3 = allEntries[id]
82 chr,strand,start_pos,true_cut,p1,p2,p3,p4,alignment = elems
83
84 p1 += start_pos
85 p2 += start_pos
86 p3 += start_pos
87 p4 += start_pos
88
89
90
91 out_fh.close()
92
93 def collect_prediction(current_dir):
94 """
95 Given the toplevel directory this function takes care that for each distinct
96 experiment the training and test predictions are evaluated.
97
98 """
99 train_suffix = '_allPredictions_TRAIN'
100 test_suffix = '_allPredictions_TEST'
101
102 run_name = 'run_+_quality_+_splicesignals_+_intron_len_1'
103
104 jp = os.path.join
105 b2s = ['-','+']
106
107 #currentRun = cPickle.load(open(jp(current_dir,'run_object_1.pickle')))
108 #QFlag = currentRun['enable_quality_scores']
109 #SSFlag = currentRun['enable_splice_signals']
110 #ILFlag = currentRun['enable_intron_length']
111 #currentRunId = '%s%s%s' % (b2s[QFlag],b2s[SSFlag],b2s[ILFlag])
112
113 filename = jp(current_dir,run_name)+test_suffix
114 print 'Prediction on: %s' % filename
115 test_result = prediction_on(filename)
116
117 fname = 'predictions.txt'
118 writePredictions(fname,test_result)
119
120
121 if __name__ == '__main__':
122 filename = sys.argv[1]
123 filtered_reads = sys.argv[2]
124 out_fh = sys.argv[3]
125 prediction_on(filename,filtered_reads,out_fh)