85c41cd438eeceaec12dade87473b3b436acd903
[qpalma.git] / qpalma / OutputFormat.py
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
8
9 import cPickle
10 import math
11 import numpy
12 import os
13 import os.path
14 import pdb
15 import sys
16
17 from qpalma.utils import pprint_alignment
18 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
19
20
21 # how aligned est and dna sequence is displayed (gap_char, 5 nucleotides, intron_char)
22 translation = '-acgtn_'
23
24 # convenience function for pretty-printing arrays
25 pp = lambda x : str(x)[1:-1].replace(' ',' ')
26
27
28 def alignment_reconstruct(current_prediction,num_exons):
29 """
30 We reconstruct the exact alignment to infer the positions of indels and the
31 sizes of the respective exons.
32 """
33
34 SpliceAlign = current_prediction['spliceAlign']
35 estAlign = current_prediction['estAlign']
36
37 dna_array = current_prediction['dna_array']
38 read_array = current_prediction['read_array']
39
40 dna_array = "".join(map(lambda x: translation[int(x)],dna_array))
41 read_array = "".join(map(lambda x: translation[int(x)],read_array))
42
43 # this array holds a number for each exon indicating its number of matches
44 exonIdentities = [0]*num_exons
45 exonGaps = [0]*num_exons
46
47 gaps = 0
48 identity = 0
49 idx = 0
50
51 for ridx in range(len(read_array)):
52 read_elem = read_array[ridx]
53 dna_elem = dna_array[ridx]
54
55 if ridx > 0 and read_array[ridx-1] != '_' and read_array[ridx] == '_':
56 idx += 1
57 gaps = 0
58 identity = 0
59
60 if read_elem == '_':
61 continue
62
63 if read_elem == dna_elem:
64 identity += 1
65
66 if read_elem == '-':
67 gaps += 1
68
69 exonIdentities[idx] = identity
70 exonGaps[idx] = gaps
71
72 return exonIdentities,exonGaps
73
74
75 def getUniquePrediction(allPredictions):
76 """
77 Since one read can have several alignments due to several possible seed
78 positions we have to preprocess all predictions. This function returns for
79 every read only one alignment that is the highest scoring under the QPalma
80 model.
81 """
82
83 allUniquePredictions = {}
84
85 for new_prediction in allPredictions:
86 id = new_prediction['id']
87 if allUniquePredictions.has_key(id):
88 current_prediction = allUniquePredictions[id]
89
90 current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
91 new_score = new_prediction['DPScores'].flatten().tolist()[0][0]
92
93 if current_a_score < new_score :
94 allUniquePredictions[id] = new_prediction
95
96 else:
97 allUniquePredictions[id] = new_prediction
98
99 return allUniquePredictions
100
101
102 def recalculatePositions(chromo,start_pos,strand,starts,ends,seqInfo,window_size):
103 """
104 If we work on the negative strand we have to recalculate the indices other
105 wise we just have to add an offset i.e. the start position.
106 """
107
108 print 'window_size is %d' % window_size
109 print start_pos
110 print starts
111 print ends
112
113 if strand == '-':
114 _ends = [window_size - int(elem) + 2 for elem in starts]
115 _starts = [window_size - int(elem) + 2 for elem in ends]
116 starts = _starts
117 ends = _ends
118
119 print start_pos
120 print starts
121 print ends
122
123 starts = [int(elem) + start_pos-1 for elem in starts]
124 ends = [int(elem) + start_pos-1 for elem in ends]
125
126 return starts,ends
127
128
129 def createAlignmentOutput(settings,chunk_fn,result_fn):
130 """
131 Given all the predictions and a result file name. This function calculates
132 all aligmnment information and returns the result in the specified format.
133 """
134
135 out_fh = open(result_fn,'w+')
136 allPredictions = cPickle.load(open(chunk_fn))
137 #allPredictions = current_loc
138
139 print 'Loaded %d examples' % len(allPredictions)
140
141 # fetch the data needed
142 accessWrapper = DataAccessWrapper(settings)
143 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
144
145 # make predictions unique
146 allUniquePredictions = getUniquePrediction(allPredictions)
147
148 output_format = settings['output_format']
149
150 written_lines = 0
151 for pred_idx,current_prediction in enumerate(allUniquePredictions.values()):
152
153 if output_format == 'blat':
154 # blat output
155 new_line = createBlatOutput(current_prediction,settings)
156
157 elif output_format == 'ShoRe':
158 # ShoRe output
159 new_line = createShoReOutput(current_prediction,settings)
160
161 elif output_format == 'mGene':
162 # Genefinding output for mGene
163 new_line = createGenefindingOutput(current_prediction,settings)
164
165 else:
166 assert False
167
168 out_fh.write(new_line)
169 written_lines += 1
170
171 print 'Wrote out %d lines' % written_lines
172
173
174 def createBlatOutput(current_prediction,settings):
175 """
176 This function takes one prediction as input and returns the corresponding
177 alignment in blat format:
178
179 1. matches - Number of bases that match that aren't repeats
180 2. misMatches - Number of bases that don't match
181 3. repMatches - Number of bases that match but are part of repeats
182 4. nCount - Number of 'N' bases
183 5. qNumInsert - Number of inserts in query
184 6. qBaseInsert - Number of bases inserted in query
185 7. tNumInsert - Number of inserts in target
186 8. tBaseInsert - Number of bases inserted in target
187 9. strand - '+' or '-' for query strand. In mouse, second '+'or '-' is for genomic strand
188 10. qName - Query sequence name
189 11. qSize - Query sequence size
190 12. qStart - Alignment start position in query
191 13. qEnd - Alignment end position in query
192 14. tName - Target sequence name
193 15. tSize - Target sequence size
194 16. tStart - Alignment start position in target
195 17. tEnd - Alignment end position in target
196 18. blockCount - Number of blocks in the alignment
197 19. blockSizes - Comma-separated list of sizes of each block
198 20. qStarts - Comma-separated list of starting positions of each block in query
199 21. tStarts - Comma-separated list of starting positions of each block in target
200
201 """
202
203 # fetch the data needed
204 allowed_fragments = settings['allowed_fragments']
205 accessWrapper = DataAccessWrapper(settings)
206 seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
207
208 # this is the total size of the seed region
209 window_size = 2*settings['half_window_size']
210
211 # now we fetch the data of the read itself
212 id = current_prediction['id']
213
214 seq = current_prediction['read']
215 dna = current_prediction['dna']
216
217 # we do not really need the quality values here
218 q1 = 'zzz'
219
220 chromo = current_prediction['chr']
221 strand = current_prediction['strand']
222 start_pos = current_prediction['start_pos']
223 alignment = current_prediction['alignment']
224
225 DPScores = current_prediction['DPScores']
226
227 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
228 tExonSizes,tStarts, tEnds) = alignment
229
230 if len(qExonSizes) != num_exons:
231 print 'BUG exon sizes %d'%id
232
233 new_line = ''
234 #try:
235 exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
236 #except:
237 # print 'Bug in example %d (idx: %d)' %\
238 # (current_prediction['id'],pred_idx)
239 # continue
240
241 tStarts,tEnds = recalculatePositions(chromo,start_pos,strand,tStarts,tEnds,seqInfo,window_size)
242
243 ids = exonIdentities
244 ids = [int(elem) for elem in ids]
245 gaps = exonGaps
246 gaps = [int(elem) for elem in gaps]
247
248 new_line = '%d\t%d\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
249 (id,chromo,strand,seq,start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
250 pp(qExonSizes),pp(qStarts),pp(qEnds),pp(tExonSizes),\
251 pp(tStarts),pp(tEnds),pp(ids),pp(gaps))
252
253 return new_line
254
255
256 def createGenefindingOutput(current_loc,out_fname):
257 """
258 This is an output format used by mGene. It consists of positional informations to generate the exon and intron tracks.
259 """
260
261 #out_fh.write("%d\t%s\t%s\t%s\t%s\t%s\n"%(chromo,strand,pp(starts),pp(ends),pp(ids),pp(gaps)))
262 pass
263
264
265 def createShoReOutput(current_loc,out_fname):
266 """
267 This format is used by the additional steps implemented in the ShoRe pipeline.
268 """
269
270 pass