+ got rid of some legacy code
[qpalma.git] / qpalma / OutputFormat.py
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
8
9 import cPickle
10 import math
11 import numpy
12 import os
13 import os.path
14 import pdb
15 import sys
16
17 from qpalma.utils import pprint_alignment
18 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
19
20
21 # how aligned est and dna sequence is displayed (gap_char, 5 nucleotides, intron_char)
22 translation = '-acgtn_'
23
24 # convenience function for pretty-printing arrays
25 pp = lambda x : str(x)[1:-1].replace(' ','')
26
27
28 def alignment_reconstruct(current_prediction,num_exons):
29 """
30 We reconstruct the exact alignment to infer the positions of indels and the
31 sizes of the respective exons.
32 """
33
34 SpliceAlign = current_prediction['spliceAlign']
35 estAlign = current_prediction['estAlign']
36
37 dna_array = current_prediction['dna_array']
38 read_array = current_prediction['read_array']
39
40 dna_array = "".join(map(lambda x: translation[int(x)],dna_array))
41 read_array = "".join(map(lambda x: translation[int(x)],read_array))
42
43 # this array holds a number for each exon indicating its number of matches
44 exonIdentities = [0]*num_exons
45 exonGaps = [0]*num_exons
46
47 gaps = 0
48 identity = 0
49 idx = 0
50
51 for ridx in range(len(read_array)):
52 read_elem = read_array[ridx]
53 dna_elem = dna_array[ridx]
54
55 if ridx > 0 and read_array[ridx-1] != '_' and read_array[ridx] == '_':
56 idx += 1
57 gaps = 0
58 identity = 0
59
60 if read_elem == '_':
61 continue
62
63 if read_elem == dna_elem:
64 identity += 1
65
66 if read_elem == '-':
67 gaps += 1
68
69 exonIdentities[idx] = identity
70 exonGaps[idx] = gaps
71
72 return exonIdentities,exonGaps
73
74
75 def getUniquePrediction(allPredictions):
76 """
77 Since one read can have several alignments due to several possible seed
78 positions we have to preprocess all predictions. This function returns for
79 every read only one alignment that is the highest scoring under the QPalma
80 model.
81 """
82
83 allUniquePredictions = {}
84
85 for new_prediction in allPredictions:
86 id = new_prediction['id']
87 if allUniquePredictions.has_key(id):
88 current_prediction = allUniquePredictions[id]
89
90 current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
91 new_score = new_prediction['DPScores'].flatten().tolist()[0][0]
92
93 if current_a_score < new_score :
94 allUniquePredictions[id] = new_prediction
95
96 else:
97 allUniquePredictions[id] = new_prediction
98
99 return allUniquePredictions
100
101
102 def recalculatePositions(chromo,start_pos,strand,starts,ends,seqInfo,window_size):
103 """
104 If we work on the negative strand we have to recalculate the indices other
105 wise we just have to add an offset i.e. the start position.
106 """
107
108 if strand == '-':
109 total_size = seqInfo.chromo_sizes[chromo]
110 start_pos = total_size - start_pos
111
112 if strand == '+':
113 starts = [int(elem) + start_pos-2 for elem in starts]
114 ends = [int(elem) + start_pos-2 for elem in ends]
115 else:
116 starts = [int(elem) + start_pos for elem in starts]
117 ends = [int(elem) + start_pos for elem in ends]
118
119 if strand == '-':
120 starts = [e-window_size for e in starts]
121 ends = [e-window_size for e in ends]
122
123 return starts,ends
124
125
126 def createAlignmentOutput(settings,chunk_fn,result_fn):
127 """
128 Given all the predictions and a result file name. This function calculates
129 all aligmnment information and returns the result in the specified format.
130 """
131
132 out_fh = open(result_fn,'w+')
133 allPredictions = cPickle.load(open(chunk_fn))
134 #allPredictions = current_loc
135
136 print 'Loaded %d examples' % len(allPredictions)
137
138 # fetch the data needed
139 accessWrapper = DataAccessWrapper(settings)
140 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
141
142 # make predictions unique
143 allUniquePredictions = getUniquePrediction(allPredictions)
144
145 output_format = settings['output_format']
146
147 written_lines = 0
148 for pred_idx,current_prediction in enumerate(allUniquePredictions.values()):
149
150 if output_format == 'BLAT':
151 # BLAT output
152 new_line = createBlatOutput(current_prediction,settings)
153
154 elif output_format == 'ShoRe':
155 # ShoRe output
156 new_line = createShoReOutput(current_prediction,settings)
157
158 elif output_format == 'mGene':
159 # Genefinding output for mGene
160 new_line = createGenefindingOutput(current_prediction,settings)
161
162 else:
163 assert False
164
165 out_fh.write(new_line)
166 written_lines += 1
167
168 print 'Wrote out %d lines' % written_lines
169
170
171 def createBlatOutput(current_prediction,settings):
172 """
173 This function takes one prediction as input and returns the corresponding
174 alignment in BLAT format:
175
176 1. matches - Number of bases that match that aren't repeats
177 2. misMatches - Number of bases that don't match
178 3. repMatches - Number of bases that match but are part of repeats
179 4. nCount - Number of 'N' bases
180 5. qNumInsert - Number of inserts in query
181 6. qBaseInsert - Number of bases inserted in query
182 7. tNumInsert - Number of inserts in target
183 8. tBaseInsert - Number of bases inserted in target
184 9. strand - '+' or '-' for query strand. In mouse, second '+'or '-' is for genomic strand
185 10. qName - Query sequence name
186 11. qSize - Query sequence size
187 12. qStart - Alignment start position in query
188 13. qEnd - Alignment end position in query
189 14. tName - Target sequence name
190 15. tSize - Target sequence size
191 16. tStart - Alignment start position in target
192 17. tEnd - Alignment end position in target
193 18. blockCount - Number of blocks in the alignment
194 19. blockSizes - Comma-separated list of sizes of each block
195 20. qStarts - Comma-separated list of starting positions of each block in query
196 21. tStarts - Comma-separated list of starting positions of each block in target
197
198 """
199
200 # fetch the data needed
201 allowed_fragments = settings['allowed_fragments']
202 accessWrapper = DataAccessWrapper(settings)
203 seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
204
205 # this is the total size of the seed region
206 window_size = 2*settings['half_window_size']
207
208 # now we fetch the data of the read itself
209 id = current_prediction['id']
210
211 seq = current_prediction['read']
212 dna = current_prediction['dna']
213
214 # we do not really need the quality values here
215 q1 = 'zzz'
216
217 chromo = current_prediction['chr']
218 strand = current_prediction['strand']
219 start_pos = current_prediction['start_pos']
220 alignment = current_prediction['alignment']
221
222 DPScores = current_prediction['DPScores']
223
224 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
225 tExonSizes,tStarts, tEnds) = alignment
226
227 if len(qExonSizes) != num_exons:
228 print 'BUG exon sizes %d'%id
229
230 new_line = ''
231 #try:
232 exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
233 #except:
234 # print 'Bug in example %d (idx: %d)' %\
235 # (current_prediction['id'],pred_idx)
236 # continue
237
238 tStarts,tEnds = recalculatePositions(chromo,start_pos,strand,tStarts,tEnds,seqInfo,window_size)
239
240 ids = exonIdentities
241 ids = [int(elem) for elem in ids]
242 gaps = exonGaps
243 gaps = [int(elem) for elem in gaps]
244
245 new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
246 (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
247 pp(qExonSizes),pp(qStarts),pp(qEnds),pp(tExonSizes),\
248 pp(tStarts),pp(tEnds),pp(ids),pp(gaps))
249
250 return new_line
251
252
253 def createGenefindingOutput(current_loc,out_fname):
254 """
255 This is an output format used by mGene. It consists of positional informations to generate the exon and intron tracks.
256 """
257
258 #out_fh.write("%d\t%s\t%s\t%s\t%s\t%s\n"%(chromo,strand,pp(starts),pp(ends),pp(ids),pp(gaps)))
259 pass
260
261
262 def createShoReOutput(current_loc,out_fname):
263 """
264 This format is used by the additional steps implemented in the ShoRe pipeline.
265 """
266
267 pass