1 # This program is free software; you can redistribute it and/or modify
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
9 import cPickle
10 import math
11 import numpy
12 import os
13 import os.path
14 import pdb
15 import sys
17 from qpalma.utils import pprint_alignment
18 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
21 # how aligned est and dna sequence is displayed (gap_char, 5 nucleotides, intron_char)
22 translation = '-acgtn_'
24 # convenience function for pretty-printing arrays
25 pp = lambda x : str(x)[1:-1].replace(' ',' ')
28 def alignment_reconstruct(current_prediction,num_exons):
29 """
30 We reconstruct the exact alignment to infer the positions of indels and the
31 sizes of the respective exons.
32 """
34 SpliceAlign = current_prediction['spliceAlign']
35 estAlign = current_prediction['estAlign']
37 dna_array = current_prediction['dna_array']
40 dna_array = "".join(map(lambda x: translation[int(x)],dna_array))
43 # this array holds a number for each exon indicating its number of matches
44 exonIdentities = [0]*num_exons
45 exonGaps = [0]*num_exons
47 gaps = 0
48 identity = 0
49 idx = 0
53 dna_elem = dna_array[ridx]
55 if ridx > 0 and read_array[ridx-1] != '_' and read_array[ridx] == '_':
56 idx += 1
57 gaps = 0
58 identity = 0
61 continue
64 identity += 1
67 gaps += 1
69 exonIdentities[idx] = identity
70 exonGaps[idx] = gaps
72 return exonIdentities,exonGaps
75 def getUniquePrediction(allPredictions):
76 """
77 Since one read can have several alignments due to several possible seed
78 positions we have to preprocess all predictions. This function returns for
79 every read only one alignment that is the highest scoring under the QPalma
80 model.
81 """
83 allUniquePredictions = {}
85 for new_prediction in allPredictions:
86 id = new_prediction['id']
87 if allUniquePredictions.has_key(id):
88 current_prediction = allUniquePredictions[id]
90 current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
91 new_score = new_prediction['DPScores'].flatten().tolist()[0][0]
93 if current_a_score < new_score :
94 allUniquePredictions[id] = new_prediction
96 else:
97 allUniquePredictions[id] = new_prediction
99 return allUniquePredictions
102 def recalculatePositions(chromo,start_pos,strand,starts,ends,seqInfo,window_size):
103 """
104 If we work on the negative strand we have to recalculate the indices other
105 wise we just have to add an offset i.e. the start position.
106 """
108 print 'window_size is %d' % window_size
109 print start_pos
110 print starts
111 print ends
113 if strand == '-':
114 _ends = [window_size - int(elem) + 2 for elem in starts]
115 _starts = [window_size - int(elem) + 2 for elem in ends]
116 starts = _starts
117 ends = _ends
119 print start_pos
120 print starts
121 print ends
123 starts = [int(elem) + start_pos-1 for elem in starts]
124 ends = [int(elem) + start_pos-1 for elem in ends]
126 return starts,ends
129 def createAlignmentOutput(settings,chunk_fn,result_fn):
130 """
131 Given all the predictions and a result file name. This function calculates
132 all aligmnment information and returns the result in the specified format.
133 """
135 out_fh = open(result_fn,'w+')
137 #allPredictions = current_loc
139 print 'Loaded %d examples' % len(allPredictions)
141 # fetch the data needed
142 accessWrapper = DataAccessWrapper(settings)
143 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
145 # make predictions unique
146 allUniquePredictions = getUniquePrediction(allPredictions)
148 output_format = settings['output_format']
150 written_lines = 0
151 for pred_idx,current_prediction in enumerate(allUniquePredictions.values()):
153 if output_format == 'blat':
154 # blat output
155 new_line = createBlatOutput(current_prediction,settings)
157 elif output_format == 'ShoRe':
158 # ShoRe output
159 new_line = createShoReOutput(current_prediction,settings)
161 elif output_format == 'mGene':
162 # Genefinding output for mGene
163 new_line = createGenefindingOutput(current_prediction,settings)
165 else:
166 assert False
168 out_fh.write(new_line)
169 written_lines += 1
171 print 'Wrote out %d lines' % written_lines
174 def createBlatOutput(current_prediction,settings):
175 """
176 This function takes one prediction as input and returns the corresponding
177 alignment in blat format:
179 1. matches - Number of bases that match that aren't repeats
180 2. misMatches - Number of bases that don't match
181 3. repMatches - Number of bases that match but are part of repeats
182 4. nCount - Number of 'N' bases
183 5. qNumInsert - Number of inserts in query
184 6. qBaseInsert - Number of bases inserted in query
185 7. tNumInsert - Number of inserts in target
186 8. tBaseInsert - Number of bases inserted in target
187 9. strand - '+' or '-' for query strand. In mouse, second '+'or '-' is for genomic strand
188 10. qName - Query sequence name
189 11. qSize - Query sequence size
190 12. qStart - Alignment start position in query
191 13. qEnd - Alignment end position in query
192 14. tName - Target sequence name
193 15. tSize - Target sequence size
194 16. tStart - Alignment start position in target
195 17. tEnd - Alignment end position in target
196 18. blockCount - Number of blocks in the alignment
197 19. blockSizes - Comma-separated list of sizes of each block
198 20. qStarts - Comma-separated list of starting positions of each block in query
199 21. tStarts - Comma-separated list of starting positions of each block in target
201 """
203 # fetch the data needed
204 allowed_fragments = settings['allowed_fragments']
205 accessWrapper = DataAccessWrapper(settings)
206 seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
208 # this is the total size of the seed region
209 window_size = 2*settings['half_window_size']
211 # now we fetch the data of the read itself
212 id = current_prediction['id']
215 dna = current_prediction['dna']
217 # we do not really need the quality values here
218 q1 = 'zzz'
220 chromo = current_prediction['chr']
221 strand = current_prediction['strand']
222 start_pos = current_prediction['start_pos']
223 alignment = current_prediction['alignment']
225 DPScores = current_prediction['DPScores']
227 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
228 tExonSizes,tStarts, tEnds) = alignment
230 if len(qExonSizes) != num_exons:
231 print 'BUG exon sizes %d'%id
233 new_line = ''
234 #try:
235 exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
236 #except:
237 # print 'Bug in example %d (idx: %d)' %\
238 # (current_prediction['id'],pred_idx)
239 # continue
241 tStarts,tEnds = recalculatePositions(chromo,start_pos,strand,tStarts,tEnds,seqInfo,window_size)
243 ids = exonIdentities
244 ids = [int(elem) for elem in ids]
245 gaps = exonGaps
246 gaps = [int(elem) for elem in gaps]
248 new_line = '%d\t%d\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
249 (id,chromo,strand,seq,start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
250 pp(qExonSizes),pp(qStarts),pp(qEnds),pp(tExonSizes),\
251 pp(tStarts),pp(tEnds),pp(ids),pp(gaps))
253 return new_line
256 def createGenefindingOutput(current_loc,out_fname):
257 """
258 This is an output format used by mGene. It consists of positional informations to generate the exon and intron tracks.
259 """
261 #out_fh.write("%d\t%s\t%s\t%s\t%s\t%s\n"%(chromo,strand,pp(starts),pp(ends),pp(ids),pp(gaps)))
262 pass
265 def createShoReOutput(current_loc,out_fname):
266 """
267 This format is used by the additional steps implemented in the ShoRe pipeline.
268 """
270 pass