+ extended pipeline code
[qpalma.git] / qpalma / OutputFormat.py
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
8
9 import cPickle
10 import math
11 import numpy
12 import os
13 import os.path
14 import pdb
15 import sys
16
17 from qpalma.utils import pprint_alignment
18 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
19
20
21 translation = '-acgtn_' # how aligned est and dna sequence is displayed (gap_char, 5 nucleotides, intron_char)
22
23
24 def alignment_reconstruct(current_prediction,num_exons):
25 """
26 We reconstruct the exact alignment to infer the positions of indels and the
27 sizes of the respective exons.
28 """
29
30 SpliceAlign = current_prediction['spliceAlign']
31 estAlign = current_prediction['estAlign']
32
33 dna_array = current_prediction['dna_array']
34 read_array = current_prediction['read_array']
35
36 dna_array = "".join(map(lambda x: translation[int(x)],dna_array))
37 read_array = "".join(map(lambda x: translation[int(x)],read_array))
38
39 # this array holds a number for each exon indicating its number of matches
40 exonIdentities = [0]*num_exons
41 exonGaps = [0]*num_exons
42
43 gaps = 0
44 identity = 0
45 idx = 0
46
47 for ridx in range(len(read_array)):
48 read_elem = read_array[ridx]
49 dna_elem = dna_array[ridx]
50
51 if ridx > 0 and read_array[ridx-1] != '_' and read_array[ridx] == '_':
52 idx += 1
53 gaps = 0
54 identity = 0
55
56 if read_elem == '_':
57 continue
58
59 if read_elem == dna_elem:
60 identity += 1
61
62 if read_elem == '-':
63 gaps += 1
64
65 exonIdentities[idx] = identity
66 exonGaps[idx] = gaps
67
68 return exonIdentities,exonGaps
69
70
71 def getUniquePrediction(allPredictions):
72 """
73 Since one read can have several alignments due to several possible seed
74 positions we have to preprocess all prediction. This function return for
75 every read only one alignment that is the highest scoring under the QPalma
76 model.
77 """
78
79 allUniquePredictions = {}
80
81 for new_prediction in allPredictions:
82 id = new_prediction['id']
83 if allUniquePredictions.has_key(id):
84 current_prediction = allUniquePredictions[id]
85
86 current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
87 new_score = new_prediction['DPScores'].flatten().tolist()[0][0]
88
89 if current_a_score < new_score :
90 allUniquePredictions[id] = new_prediction
91
92 else:
93 allUniquePredictions[id] = new_prediction
94
95 return allUniquePredictions
96
97
98 def createOutput(current_loc,out_fname,output_format):
99 """
100 Given all the predictions and a result file name. This function calculates
101 all aligmnment information and returns the result in the specified format.
102 """
103
104 out_fh = open(out_fname,'w+')
105 allPredictions = cPickle.load(open(current_loc))
106 #allPredictions = current_loc
107
108 print 'Loaded %d examples' % len(allPredictions)
109
110 # fetch the data needed
111 accessWrapper = DataAccessWrapper(settings)
112 seqInfo = SeqSpliceInfo(accessWrapper,,settings['allowed_fragments'])
113
114 # make predictions unique
115 allUniquePredictions = getUniquePrediction(allPredictions)
116
117 written_lines = 0
118 for pred_idx,current_prediction in enumerate(allUniquePredictions.values()):
119
120 if output_format == 'BLAT':
121 # BLAT output
122 new_line = createBlatOutput(current_prediction)
123
124 elif output_format == 'ShoRe':
125 # ShoRe output
126 new_line = createShoReOutput(current_prediction)
127
128 elif output_format == 'mGene':
129 # Genefinding output for mGene
130 new_line = createGenefindingOutput(current_prediction)
131
132 else:
133 assert False
134
135 out_fh.write(new_line)
136 written_lines += 1
137
138 print 'Wrote out %d lines' % written_lines
139
140
141 def createBlatOutput(current_prediction):
142 """
143 This function takes one prediction as input and returns the corresponding
144 alignment in BLAT format.
145 """
146
147 id = current_prediction['id']
148
149 seq = current_prediction['read']
150 dna = current_prediction['dna']
151
152 # we do not really need the quality values here
153 q1 = 'zzz'
154
155 chromo = current_prediction['chr']
156 strand = current_prediction['strand']
157 start_pos = current_prediction['start_pos']
158 alignment = current_prediction['alignment']
159
160 DPScores = current_prediction['DPScores']
161 predExons = current_prediction['predExons']
162 predExons = [e+start_pos for e in predExons]
163
164 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
165 tExonSizes,tStarts, tEnds) = alignment
166
167 if len(qExonSizes) != num_exons:
168 print 'BUG exon sizes %d'%id
169 continue
170
171 EST2GFF = False
172 new_line = ''
173
174 if EST2GFF:
175 predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
176
177 match = predExons[0,1] - predExons[0,0]
178 if predExons.shape == (2,2):
179 match += predExons[1,1] - predExons[1,0]
180
181 mismatch = 0
182 repmatch = 0
183 Ns = 0
184 Qgapcnt = 0
185 Qgapbs = 0
186
187 Tgapcnt = 0
188 Tgapbs = 0
189
190 if predExons.shape == (2,2):
191 Tgapbs += predExons[1,0] - predExons[0,1]
192 Tgapcnt = 1
193
194 # &strand, Qname, &Qsize,
195 # &Qstart, &Qend, Tname, &Tsize,
196 # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
197 #
198 # ATTENTION
199 #
200 # we enforce equal exons sizes for q and t because we are missing indel
201 # positions of the alignment
202
203 if qExonSizes[0] != tExonSizes[0]:
204 continue
205
206 Qname = '%d(%2.2f)'%(id,DPScores.tolist()[0][0])
207 Qsize = len(seq)
208 Qstart = qStart
209 Qend = qEnd
210 Tname = 'CHR%d'%chromo
211
212 start_pos -= 2
213
214 Tsize = tEnd+1 + start_pos
215 Tstart = tStart + start_pos
216 Tend = tEnd + start_pos
217 blockcnt = Tgapcnt+1
218 exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
219 Qstarts_str = str(qStarts)[1:-1].replace(' ','')
220 Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
221
222 new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
223 % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
224 Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
225 Tname, Tsize, Tstart, Tend,\
226 blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
227
228 else:
229 #try:
230 exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
231 #except:
232 # print 'Bug in example %d (idx: %d)' %\
233 # (current_prediction['id'],pred_idx)
234 # continue
235
236 #t_size = tEnd - tStart
237 #read_size = 38
238 #if strand == '-':
239 # total_size = seqInfo.chromo_sizes[chromo]
240
241 # start_pos = total_size - start_pos
242
243 # qExonSizes.reverse()
244 # qStarts = [read_size-e for e in qEnds]
245 # qStarts.reverse()
246 # qEnds = [read_size-e for e in qStarts]
247 # qEnds.reverse()
248
249 # tExonSizes.reverse()
250 # tStarts = [t_size-e for e in tEnds]
251 # tStarts.reverse()
252 # tEnds = [t_size-e for e in tStarts]
253 # tEnds.reverse()
254
255 # exonIdentities.reverse()
256 # exonGaps.reverse()
257
258 pp = lambda x : str(x)[1:-1].replace(' ','')
259
260 new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
261 (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
262 pp(qExonSizes),pp(qStarts),\
263 pp(qEnds),pp(tExonSizes),\
264 pp(tStarts),pp(tEnds),\
265 pp(exonIdentities),pp(exonGaps))
266
267 return new_line
268
269
270 def createGenefindingOutput(current_loc,out_fname):
271 """
272 """
273 pass
274
275
276 def createShoReOutput(current_loc,out_fname):
277 """
278 """
279 pass