af7178c8317e326cdcb2173ef99efe29ac3b07dc
[qpalma.git] / qpalma / OutputFormat.py
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
8
9 import cPickle
10 import math
11 import numpy
12 import os
13 import os.path
14 import pdb
15 import sys
16
17 from qpalma.utils import pprint_alignment
18 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
19
20
21 # how aligned est and dna sequence is displayed (gap_char, 5 nucleotides, intron_char)
22 translation = '-acgtn_'
23
24 # convenience function for pretty-printing arrays
25 pp = lambda x : str(x)[1:-1].replace(' ','')
26
27
28 def alignment_reconstruct(current_prediction,num_exons):
29 """
30 We reconstruct the exact alignment to infer the positions of indels and the
31 sizes of the respective exons.
32 """
33
34 SpliceAlign = current_prediction['spliceAlign']
35 estAlign = current_prediction['estAlign']
36
37 dna_array = current_prediction['dna_array']
38 read_array = current_prediction['read_array']
39
40 dna_array = "".join(map(lambda x: translation[int(x)],dna_array))
41 read_array = "".join(map(lambda x: translation[int(x)],read_array))
42
43 # this array holds a number for each exon indicating its number of matches
44 exonIdentities = [0]*num_exons
45 exonGaps = [0]*num_exons
46
47 gaps = 0
48 identity = 0
49 idx = 0
50
51 for ridx in range(len(read_array)):
52 read_elem = read_array[ridx]
53 dna_elem = dna_array[ridx]
54
55 if ridx > 0 and read_array[ridx-1] != '_' and read_array[ridx] == '_':
56 idx += 1
57 gaps = 0
58 identity = 0
59
60 if read_elem == '_':
61 continue
62
63 if read_elem == dna_elem:
64 identity += 1
65
66 if read_elem == '-':
67 gaps += 1
68
69 exonIdentities[idx] = identity
70 exonGaps[idx] = gaps
71
72 return exonIdentities,exonGaps
73
74
75 def getUniquePrediction(allPredictions):
76 """
77 Since one read can have several alignments due to several possible seed
78 positions we have to preprocess all predictions. This function returns for
79 every read only one alignment that is the highest scoring under the QPalma
80 model.
81 """
82
83 allUniquePredictions = {}
84
85 for new_prediction in allPredictions:
86 id = new_prediction['id']
87 if allUniquePredictions.has_key(id):
88 current_prediction = allUniquePredictions[id]
89
90 current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
91 new_score = new_prediction['DPScores'].flatten().tolist()[0][0]
92
93 if current_a_score < new_score :
94 allUniquePredictions[id] = new_prediction
95
96 else:
97 allUniquePredictions[id] = new_prediction
98
99 return allUniquePredictions
100
101
102 def recalculatePositions(chromo,start_pos,strand,starts,ends,seqInfo,window_size):
103 """
104 If we work on the negative strand we have to recalculate the indices other
105 wise we just have to add an offset i.e. the start position.
106 """
107
108 if strand == '-':
109 total_size = seqInfo.chromo_sizes[chromo]
110 start_pos = total_size - start_pos
111
112 starts = [int(elem) + start_pos for elem in starts]
113 ends = [int(elem) + start_pos for elem in ends]
114
115 if strand == '-':
116 starts = [e-window_size for e in starts]
117 ends = [e-window_size for e in ends]
118
119 return starts,ends
120
121
122 def createAlignmentOutput(settings,chunk_fn,result_fn):
123 """
124 Given all the predictions and a result file name. This function calculates
125 all aligmnment information and returns the result in the specified format.
126 """
127
128 out_fh = open(result_fn,'w+')
129 allPredictions = cPickle.load(open(chunk_fn))
130 #allPredictions = current_loc
131
132 print 'Loaded %d examples' % len(allPredictions)
133
134 # fetch the data needed
135 accessWrapper = DataAccessWrapper(settings)
136 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
137
138 # make predictions unique
139 allUniquePredictions = getUniquePrediction(allPredictions)
140
141 output_format = settings['output_format']
142
143 written_lines = 0
144 for pred_idx,current_prediction in enumerate(allUniquePredictions.values()):
145
146 if output_format == 'BLAT':
147 # BLAT output
148 new_line = createBlatOutput(current_prediction,settings)
149
150 elif output_format == 'ShoRe':
151 # ShoRe output
152 new_line = createShoReOutput(current_prediction,settings)
153
154 elif output_format == 'mGene':
155 # Genefinding output for mGene
156 new_line = createGenefindingOutput(current_prediction,settings)
157
158 else:
159 assert False
160
161 out_fh.write(new_line)
162 written_lines += 1
163
164 print 'Wrote out %d lines' % written_lines
165
166
167 def createBlatOutput(current_prediction,settings):
168 """
169 This function takes one prediction as input and returns the corresponding
170 alignment in BLAT format.
171 """
172
173 # fetch the data needed
174 allowed_fragments = settings['allowed_fragments']
175 accessWrapper = DataAccessWrapper(settings)
176 seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
177
178 # this is the total size of the seed region
179 window_size = 2*settings['half_window_size']
180
181 # now we fetch the data of the read itself
182 id = current_prediction['id']
183
184 seq = current_prediction['read']
185 dna = current_prediction['dna']
186
187 # we do not really need the quality values here
188 q1 = 'zzz'
189
190 chromo = current_prediction['chr']
191 strand = current_prediction['strand']
192 start_pos = current_prediction['start_pos']
193 alignment = current_prediction['alignment']
194
195 DPScores = current_prediction['DPScores']
196
197 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
198 tExonSizes,tStarts, tEnds) = alignment
199
200 if len(qExonSizes) != num_exons:
201 print 'BUG exon sizes %d'%id
202
203 new_line = ''
204 #try:
205 exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
206 #except:
207 # print 'Bug in example %d (idx: %d)' %\
208 # (current_prediction['id'],pred_idx)
209 # continue
210
211 tStarts,tEnds = recalculatePositions(chromo,start_pos,strand,tStarts,tEnds,seqInfo,window_size)
212
213 ids = exonIdentities
214 ids = [int(elem) for elem in ids]
215 gaps = exonGaps
216 gaps = [int(elem) for elem in gaps]
217
218 new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
219 (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
220 pp(qExonSizes),pp(qStarts),pp(qEnds),pp(tExonSizes),\
221 pp(tStarts),pp(tEnds),pp(ids),pp(gaps))
222
223 return new_line
224
225
226 def createGenefindingOutput(current_loc,out_fname):
227 """
228 This is an output format used by mGene. It consists of positional informations to generate the exon and intron tracks.
229 """
230
231 #out_fh.write("%d\t%s\t%s\t%s\t%s\t%s\n"%(chromo,strand,pp(starts),pp(ends),pp(ids),pp(gaps)))
232 pass
233
234
235 def createShoReOutput(current_loc,out_fname):
236 """
237 This format is used by the additional steps implemented in the ShoRe pipeline.
238 """
239
240 pass