+ minor changes in the paths
[qpalma.git] / scripts / createAlignmentFileFromPrediction.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import cPickle
5 import math
6 import numpy
7 import os
8 import os.path
9 import pdb
10 import sys
11
12 import qparser
13
14 from Utils import pprint_alignment
15
16 import palma.palma_utils as pu
17 from palma.output_formating import print_results
18
19 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
20
21
22 def alignment_reconstruct(current_prediction,num_exons):
23 """
24 We reconstruct the exact alignment to infer the positions of indels and the
25 sizes of the respective exons.
26 """
27 translation = '-acgtn_' # how aligned est and dna sequence is displayed (gap_char, 5 nucleotides, intron_char)
28
29 SpliceAlign = current_prediction['spliceAlign']
30 estAlign = current_prediction['estAlign']
31
32 dna_array = current_prediction['dna_array']
33 read_array = current_prediction['read_array']
34
35 dna_array = "".join(map(lambda x: translation[int(x)],dna_array))
36 read_array = "".join(map(lambda x: translation[int(x)],read_array))
37
38 # this array holds a number for each exon indicating its number of matches
39 exonIdentities = [0]*num_exons
40 exonGaps = [0]*num_exons
41
42 gaps = 0
43 identity = 0
44 idx = 0
45
46 for ridx in range(len(read_array)):
47 read_elem = read_array[ridx]
48 dna_elem = dna_array[ridx]
49
50 if ridx > 0 and read_array[ridx-1] != '_' and read_array[ridx] == '_':
51 idx += 1
52 gaps = 0
53 identity = 0
54
55 if read_elem == '_':
56 continue
57
58 if read_elem == dna_elem:
59 identity += 1
60
61 if read_elem == '-':
62 gaps += 1
63
64 exonIdentities[idx] = identity
65 exonGaps[idx] = gaps
66
67 return exonIdentities,exonGaps
68
69
70 def create_alignment_file(current_loc,out_fname):
71
72 out_fh = open(out_fname,'w+')
73 allPredictions = cPickle.load(open(current_loc))
74 #allPredictions = current_loc
75
76 print 'Loaded %d examples' % len(allPredictions)
77
78 # fetch the data needed
79 g_dir = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
80 acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
81 don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
82
83 g_fmt = 'chr%d.dna.flat'
84 s_fmt = 'contig_%d%s'
85
86 num_chromo = 6
87
88 accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
89 seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
90
91 # Uniqueness filtering using alignment score for reads with several
92 # alignments
93 allUniquePredictions = {}
94
95 for new_prediction in allPredictions:
96 id = new_prediction['id']
97 if allUniquePredictions.has_key(id):
98 current_prediction = allUniquePredictions[id]
99
100 current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
101 new_score = new_prediction['DPScores'].flatten().tolist()[0][0]
102
103 if current_a_score < new_score :
104 allUniquePredictions[id] = new_prediction
105
106 else:
107 allUniquePredictions[id] = new_prediction
108
109 written_lines = 0
110 for pred_idx,current_prediction in enumerate(allUniquePredictions.values()):
111
112 id = current_prediction['id']
113
114 #current_ground_truth = qparser.fetch_read(id)
115 #true_cut = current_ground_truth['true_cut']
116 #seq = current_ground_truth['seq']
117 #q1 = current_ground_truth['prb']
118
119 seq = current_prediction['read']
120 dna = current_prediction['dna']
121
122 # CHECK !!!
123 q1 = 'zzz'
124
125 chromo = current_prediction['chr']
126 strand = current_prediction['strand']
127 start_pos = current_prediction['start_pos']
128 alignment = current_prediction['alignment']
129
130 DPScores = current_prediction['DPScores']
131 predExons = current_prediction['predExons']
132 predExons = [e+start_pos for e in predExons]
133
134 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
135 tExonSizes,tStarts, tEnds) = alignment
136
137 if len(qExonSizes) != num_exons:
138 print 'BUG exon sizes %d'%id
139 continue
140
141 EST2GFF = False
142 new_line = ''
143
144 if EST2GFF:
145 predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
146
147 match = predExons[0,1] - predExons[0,0]
148 if predExons.shape == (2,2):
149 match += predExons[1,1] - predExons[1,0]
150
151 mismatch = 0
152 repmatch = 0
153 Ns = 0
154 Qgapcnt = 0
155 Qgapbs = 0
156
157 Tgapcnt = 0
158 Tgapbs = 0
159
160 if predExons.shape == (2,2):
161 Tgapbs += predExons[1,0] - predExons[0,1]
162 Tgapcnt = 1
163
164 # &strand, Qname, &Qsize,
165 # &Qstart, &Qend, Tname, &Tsize,
166 # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
167 #
168 # ATTENTION
169 #
170 # we enforce equal exons sizes for q and t because we are missing indel
171 # positions of the alignment
172
173 if qExonSizes[0] != tExonSizes[0]:
174 continue
175
176 Qname = '%d(%2.2f)'%(id,DPScores.tolist()[0][0])
177 Qsize = len(seq)
178 Qstart = qStart
179 Qend = qEnd
180 Tname = 'CHR%d'%chromo
181
182 start_pos -= 2
183
184 Tsize = tEnd+1 + start_pos
185 Tstart = tStart + start_pos
186 Tend = tEnd + start_pos
187 blockcnt = Tgapcnt+1
188 exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
189 Qstarts_str = str(qStarts)[1:-1].replace(' ','')
190 Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
191
192 new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
193 % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
194 Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
195 Tname, Tsize, Tstart, Tend,\
196 blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
197
198 else:
199 #try:
200 exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
201 #except:
202 # print 'Bug in example %d (idx: %d)' %\
203 # (current_prediction['id'],pred_idx)
204 # continue
205
206 #t_size = tEnd - tStart
207 #read_size = 38
208 #if strand == '-':
209 # total_size = seqInfo.chromo_sizes[chromo]
210
211 # start_pos = total_size - start_pos
212
213 # qExonSizes.reverse()
214 # qStarts = [read_size-e for e in qEnds]
215 # qStarts.reverse()
216 # qEnds = [read_size-e for e in qStarts]
217 # qEnds.reverse()
218
219 # tExonSizes.reverse()
220 # tStarts = [t_size-e for e in tEnds]
221 # tStarts.reverse()
222 # tEnds = [t_size-e for e in tStarts]
223 # tEnds.reverse()
224
225 # exonIdentities.reverse()
226 # exonGaps.reverse()
227
228 pp = lambda x : str(x)[1:-1].replace(' ','')
229
230 new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
231 (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
232 pp(qExonSizes),pp(qStarts),\
233 pp(qEnds),pp(tExonSizes),\
234 pp(tStarts),pp(tEnds),\
235 pp(exonIdentities),pp(exonGaps))
236
237 out_fh.write(new_line)
238 written_lines += 1
239
240 print 'Wrote out %d lines' % written_lines
241
242
243 if __name__ == '__main__':
244 current_dir = sys.argv[1]
245 out_fh = sys.argv[2]
246 create_alignment_file(current_dir,out_fh)