af7178c8317e326cdcb2173ef99efe29ac3b07dc
1 # This program is free software; you can redistribute it and/or modify
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
9 import cPickle
10 import math
11 import numpy
12 import os
13 import os.path
14 import pdb
15 import sys
17 from qpalma.utils import pprint_alignment
18 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
21 # how aligned est and dna sequence is displayed (gap_char, 5 nucleotides, intron_char)
22 translation = '-acgtn_'
24 # convenience function for pretty-printing arrays
25 pp = lambda x : str(x)[1:-1].replace(' ','')
28 def alignment_reconstruct(current_prediction,num_exons):
29 """
30 We reconstruct the exact alignment to infer the positions of indels and the
31 sizes of the respective exons.
32 """
34 SpliceAlign = current_prediction['spliceAlign']
35 estAlign = current_prediction['estAlign']
37 dna_array = current_prediction['dna_array']
40 dna_array = "".join(map(lambda x: translation[int(x)],dna_array))
43 # this array holds a number for each exon indicating its number of matches
44 exonIdentities = [0]*num_exons
45 exonGaps = [0]*num_exons
47 gaps = 0
48 identity = 0
49 idx = 0
53 dna_elem = dna_array[ridx]
55 if ridx > 0 and read_array[ridx-1] != '_' and read_array[ridx] == '_':
56 idx += 1
57 gaps = 0
58 identity = 0
61 continue
64 identity += 1
67 gaps += 1
69 exonIdentities[idx] = identity
70 exonGaps[idx] = gaps
72 return exonIdentities,exonGaps
75 def getUniquePrediction(allPredictions):
76 """
77 Since one read can have several alignments due to several possible seed
78 positions we have to preprocess all predictions. This function returns for
79 every read only one alignment that is the highest scoring under the QPalma
80 model.
81 """
83 allUniquePredictions = {}
85 for new_prediction in allPredictions:
86 id = new_prediction['id']
87 if allUniquePredictions.has_key(id):
88 current_prediction = allUniquePredictions[id]
90 current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
91 new_score = new_prediction['DPScores'].flatten().tolist()[0][0]
93 if current_a_score < new_score :
94 allUniquePredictions[id] = new_prediction
96 else:
97 allUniquePredictions[id] = new_prediction
99 return allUniquePredictions
102 def recalculatePositions(chromo,start_pos,strand,starts,ends,seqInfo,window_size):
103 """
104 If we work on the negative strand we have to recalculate the indices other
105 wise we just have to add an offset i.e. the start position.
106 """
108 if strand == '-':
109 total_size = seqInfo.chromo_sizes[chromo]
110 start_pos = total_size - start_pos
112 starts = [int(elem) + start_pos for elem in starts]
113 ends = [int(elem) + start_pos for elem in ends]
115 if strand == '-':
116 starts = [e-window_size for e in starts]
117 ends = [e-window_size for e in ends]
119 return starts,ends
122 def createAlignmentOutput(settings,chunk_fn,result_fn):
123 """
124 Given all the predictions and a result file name. This function calculates
125 all aligmnment information and returns the result in the specified format.
126 """
128 out_fh = open(result_fn,'w+')
130 #allPredictions = current_loc
132 print 'Loaded %d examples' % len(allPredictions)
134 # fetch the data needed
135 accessWrapper = DataAccessWrapper(settings)
136 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
138 # make predictions unique
139 allUniquePredictions = getUniquePrediction(allPredictions)
141 output_format = settings['output_format']
143 written_lines = 0
144 for pred_idx,current_prediction in enumerate(allUniquePredictions.values()):
146 if output_format == 'BLAT':
147 # BLAT output
148 new_line = createBlatOutput(current_prediction,settings)
150 elif output_format == 'ShoRe':
151 # ShoRe output
152 new_line = createShoReOutput(current_prediction,settings)
154 elif output_format == 'mGene':
155 # Genefinding output for mGene
156 new_line = createGenefindingOutput(current_prediction,settings)
158 else:
159 assert False
161 out_fh.write(new_line)
162 written_lines += 1
164 print 'Wrote out %d lines' % written_lines
167 def createBlatOutput(current_prediction,settings):
168 """
169 This function takes one prediction as input and returns the corresponding
170 alignment in BLAT format.
171 """
173 # fetch the data needed
174 allowed_fragments = settings['allowed_fragments']
175 accessWrapper = DataAccessWrapper(settings)
176 seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
178 # this is the total size of the seed region
179 window_size = 2*settings['half_window_size']
181 # now we fetch the data of the read itself
182 id = current_prediction['id']
185 dna = current_prediction['dna']
187 # we do not really need the quality values here
188 q1 = 'zzz'
190 chromo = current_prediction['chr']
191 strand = current_prediction['strand']
192 start_pos = current_prediction['start_pos']
193 alignment = current_prediction['alignment']
195 DPScores = current_prediction['DPScores']
197 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
198 tExonSizes,tStarts, tEnds) = alignment
200 if len(qExonSizes) != num_exons:
201 print 'BUG exon sizes %d'%id
203 new_line = ''
204 #try:
205 exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
206 #except:
207 # print 'Bug in example %d (idx: %d)' %\
208 # (current_prediction['id'],pred_idx)
209 # continue
211 tStarts,tEnds = recalculatePositions(chromo,start_pos,strand,tStarts,tEnds,seqInfo,window_size)
213 ids = exonIdentities
214 ids = [int(elem) for elem in ids]
215 gaps = exonGaps
216 gaps = [int(elem) for elem in gaps]
218 new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
219 (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
220 pp(qExonSizes),pp(qStarts),pp(qEnds),pp(tExonSizes),\
221 pp(tStarts),pp(tEnds),pp(ids),pp(gaps))
223 return new_line
226 def createGenefindingOutput(current_loc,out_fname):
227 """
228 This is an output format used by mGene. It consists of positional informations to generate the exon and intron tracks.
229 """
231 #out_fh.write("%d\t%s\t%s\t%s\t%s\t%s\n"%(chromo,strand,pp(starts),pp(ends),pp(ids),pp(gaps)))
232 pass
235 def createShoReOutput(current_loc,out_fname):
236 """
237 This format is used by the additional steps implemented in the ShoRe pipeline.
238 """
240 pass