3a0c0ba44bb156692df7c2ee645a2eb9bf66d96f
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
9 import cPickle
10 import math
11 import numpy
12 import os
13 import os.path
14 import pdb
15 import sys
17 from qpalma.utils import pprint_alignment
18 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
21 # how aligned est and dna sequence is displayed (gap_char, 5 nucleotides, intron_char)
22 translation = '-acgtn_'
24 # convenience function for pretty-printing arrays
25 pp = lambda x : str(x)[1:-1].replace(' ','')
28 def alignment_reconstruct(current_prediction,num_exons):
29 """
30 We reconstruct the exact alignment to infer the positions of indels and the
31 sizes of the respective exons.
32 """
34 SpliceAlign = current_prediction['spliceAlign']
35 estAlign = current_prediction['estAlign']
37 dna_array = current_prediction['dna_array']
40 dna_array = "".join(map(lambda x: translation[int(x)],dna_array))
43 # this array holds a number for each exon indicating its number of matches
44 exonIdentities = [0]*num_exons
45 exonGaps = [0]*num_exons
47 gaps = 0
48 identity = 0
49 idx = 0
51 for ridx in range(len(read_array)):
53 dna_elem = dna_array[ridx]
55 if ridx > 0 and read_array[ridx-1] != '_' and read_array[ridx] == '_':
56 idx += 1
57 gaps = 0
58 identity = 0
60 if read_elem == '_':
61 continue
63 if read_elem == dna_elem:
64 identity += 1
66 if read_elem == '-':
67 gaps += 1
69 exonIdentities[idx] = identity
70 exonGaps[idx] = gaps
72 return exonIdentities,exonGaps
75 def getUniquePrediction(allPredictions):
76 """
77 Since one read can have several alignments due to several possible seed
78 positions we have to preprocess all predictions. This function returns for
79 every read only one alignment that is the highest scoring under the QPalma
80 model.
81 """
83 allUniquePredictions = {}
85 for new_prediction in allPredictions:
86 id = new_prediction['id']
87 if allUniquePredictions.has_key(id):
88 current_prediction = allUniquePredictions[id]
90 current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
91 new_score = new_prediction['DPScores'].flatten().tolist()[0][0]
93 if current_a_score < new_score :
94 allUniquePredictions[id] = new_prediction
96 else:
97 allUniquePredictions[id] = new_prediction
99 return allUniquePredictions
102 def recalculatePositions(chromo,start_pos,strand,starts,ends,seqInfo,window_size):
103 """
104 If we work on the negative strand we have to recalculate the indices other
105 wise we just have to add an offset i.e. the start position.
106 """
108 if strand == '-':
109 total_size = seqInfo.getFragmentSize(chromo)
110 start_pos = total_size - start_pos
112 if strand == '+':
113 starts = [int(elem) + start_pos-1 for elem in starts]
114 ends = [int(elem) + start_pos-1 for elem in ends]
115 else:
116 starts = [int(elem) + start_pos for elem in starts]
117 ends = [int(elem) + start_pos for elem in ends]
119 if strand == '-':
120 starts = [e-window_size for e in starts]
121 ends = [e-window_size for e in ends]
123 return starts,ends
126 def createAlignmentOutput(settings,chunk_fn,result_fn):
127 """
128 Given all the predictions and a result file name. This function calculates
129 all aligmnment information and returns the result in the specified format.
130 """
132 out_fh = open(result_fn,'w+')
133 allPredictions = cPickle.load(open(chunk_fn))
134 #allPredictions = current_loc
136 print 'Loaded %d examples' % len(allPredictions)
138 # fetch the data needed
139 accessWrapper = DataAccessWrapper(settings)
140 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
142 # make predictions unique
143 allUniquePredictions = getUniquePrediction(allPredictions)
145 output_format = settings['output_format']
147 written_lines = 0
148 for pred_idx,current_prediction in enumerate(allUniquePredictions.values()):
150 if output_format == 'blat':
151 # blat output
152 new_line = createBlatOutput(current_prediction,settings)
154 elif output_format == 'ShoRe':
155 # ShoRe output
156 new_line = createShoReOutput(current_prediction,settings)
158 elif output_format == 'mGene':
159 # Genefinding output for mGene
160 new_line = createGenefindingOutput(current_prediction,settings)
162 else:
163 assert False
165 out_fh.write(new_line)
166 written_lines += 1
168 print 'Wrote out %d lines' % written_lines
171 def createBlatOutput(current_prediction,settings):
172 """
173 This function takes one prediction as input and returns the corresponding
174 alignment in blat format:
176 1. matches - Number of bases that match that aren't repeats
177 2. misMatches - Number of bases that don't match
178 3. repMatches - Number of bases that match but are part of repeats
179 4. nCount - Number of 'N' bases
180 5. qNumInsert - Number of inserts in query
181 6. qBaseInsert - Number of bases inserted in query
182 7. tNumInsert - Number of inserts in target
183 8. tBaseInsert - Number of bases inserted in target
184 9. strand - '+' or '-' for query strand. In mouse, second '+'or '-' is for genomic strand
185 10. qName - Query sequence name
186 11. qSize - Query sequence size
187 12. qStart - Alignment start position in query
188 13. qEnd - Alignment end position in query
189 14. tName - Target sequence name
190 15. tSize - Target sequence size
191 16. tStart - Alignment start position in target
192 17. tEnd - Alignment end position in target
193 18. blockCount - Number of blocks in the alignment
194 19. blockSizes - Comma-separated list of sizes of each block
195 20. qStarts - Comma-separated list of starting positions of each block in query
196 21. tStarts - Comma-separated list of starting positions of each block in target
198 """
200 # fetch the data needed
201 allowed_fragments = settings['allowed_fragments']
202 accessWrapper = DataAccessWrapper(settings)
203 seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
205 # this is the total size of the seed region
206 window_size = 2*settings['half_window_size']
208 # now we fetch the data of the read itself
209 id = current_prediction['id']
211 seq = current_prediction['read']
212 dna = current_prediction['dna']
214 # we do not really need the quality values here
215 q1 = 'zzz'
217 chromo = current_prediction['chr']
218 strand = current_prediction['strand']
219 start_pos = current_prediction['start_pos']
220 alignment = current_prediction['alignment']
222 DPScores = current_prediction['DPScores']
224 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
225 tExonSizes,tStarts, tEnds) = alignment
227 if len(qExonSizes) != num_exons:
228 print 'BUG exon sizes %d'%id
230 new_line = ''
231 #try:
232 exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
233 #except:
234 # print 'Bug in example %d (idx: %d)' %\
235 # (current_prediction['id'],pred_idx)
236 # continue
238 tStarts,tEnds = recalculatePositions(chromo,start_pos,strand,tStarts,tEnds,seqInfo,window_size)
240 ids = exonIdentities
241 ids = [int(elem) for elem in ids]
242 gaps = exonGaps
243 gaps = [int(elem) for elem in gaps]
245 new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
246 (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
247 pp(qExonSizes),pp(qStarts),pp(qEnds),pp(tExonSizes),\
248 pp(tStarts),pp(tEnds),pp(ids),pp(gaps))
250 return new_line
253 def createGenefindingOutput(current_loc,out_fname):
254 """
255 This is an output format used by mGene. It consists of positional informations to generate the exon and intron tracks.
256 """
258 #out_fh.write("%d\t%s\t%s\t%s\t%s\t%s\n"%(chromo,strand,pp(starts),pp(ends),pp(ids),pp(gaps)))
259 pass
262 def createShoReOutput(current_loc,out_fname):
263 """
264 This format is used by the additional steps implemented in the ShoRe pipeline.
265 """
267 pass