1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
9 import cPickle
10 import math
11 import numpy
12 import os
13 import os.path
14 import pdb
15 import sys
17 from qpalma.utils import pprint_alignment
18 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
21 translation = '-acgtn_' # how aligned est and dna sequence is displayed (gap_char, 5 nucleotides, intron_char)
24 def alignment_reconstruct(current_prediction,num_exons):
25 """
26 We reconstruct the exact alignment to infer the positions of indels and the
27 sizes of the respective exons.
28 """
30 SpliceAlign = current_prediction['spliceAlign']
31 estAlign = current_prediction['estAlign']
33 dna_array = current_prediction['dna_array']
36 dna_array = "".join(map(lambda x: translation[int(x)],dna_array))
39 # this array holds a number for each exon indicating its number of matches
40 exonIdentities = [0]*num_exons
41 exonGaps = [0]*num_exons
43 gaps = 0
44 identity = 0
45 idx = 0
47 for ridx in range(len(read_array)):
49 dna_elem = dna_array[ridx]
51 if ridx > 0 and read_array[ridx-1] != '_' and read_array[ridx] == '_':
52 idx += 1
53 gaps = 0
54 identity = 0
56 if read_elem == '_':
57 continue
59 if read_elem == dna_elem:
60 identity += 1
62 if read_elem == '-':
63 gaps += 1
65 exonIdentities[idx] = identity
66 exonGaps[idx] = gaps
68 return exonIdentities,exonGaps
71 def getUniquePrediction(allPredictions):
72 """
73 Since one read can have several alignments due to several possible seed
74 positions we have to preprocess all prediction. This function return for
75 every read only one alignment that is the highest scoring under the QPalma
76 model.
77 """
79 allUniquePredictions = {}
81 for new_prediction in allPredictions:
82 id = new_prediction['id']
83 if allUniquePredictions.has_key(id):
84 current_prediction = allUniquePredictions[id]
86 current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
87 new_score = new_prediction['DPScores'].flatten().tolist()[0][0]
89 if current_a_score < new_score :
90 allUniquePredictions[id] = new_prediction
92 else:
93 allUniquePredictions[id] = new_prediction
95 return allUniquePredictions
98 def createOutput(current_loc,out_fname,output_format):
99 """
100 Given all the predictions and a result file name. This function calculates
101 all aligmnment information and returns the result in the specified format.
102 """
104 out_fh = open(out_fname,'w+')
105 allPredictions = cPickle.load(open(current_loc))
106 #allPredictions = current_loc
108 print 'Loaded %d examples' % len(allPredictions)
110 # fetch the data needed
111 accessWrapper = DataAccessWrapper(settings)
112 seqInfo = SeqSpliceInfo(accessWrapper,,settings['allowed_fragments'])
114 # make predictions unique
115 allUniquePredictions = getUniquePrediction(allPredictions)
117 written_lines = 0
118 for pred_idx,current_prediction in enumerate(allUniquePredictions.values()):
120 if output_format == 'BLAT':
121 # BLAT output
122 new_line = createBlatOutput(current_prediction)
124 elif output_format == 'ShoRe':
125 # ShoRe output
126 new_line = createShoReOutput(current_prediction)
128 elif output_format == 'mGene':
129 # Genefinding output for mGene
130 new_line = createGenefindingOutput(current_prediction)
132 else:
133 assert False
135 out_fh.write(new_line)
136 written_lines += 1
138 print 'Wrote out %d lines' % written_lines
141 def createBlatOutput(current_prediction):
142 """
143 This function takes one prediction as input and returns the corresponding
144 alignment in BLAT format.
145 """
147 id = current_prediction['id']
149 seq = current_prediction['read']
150 dna = current_prediction['dna']
152 # we do not really need the quality values here
153 q1 = 'zzz'
155 chromo = current_prediction['chr']
156 strand = current_prediction['strand']
157 start_pos = current_prediction['start_pos']
158 alignment = current_prediction['alignment']
160 DPScores = current_prediction['DPScores']
161 predExons = current_prediction['predExons']
162 predExons = [e+start_pos for e in predExons]
164 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
165 tExonSizes,tStarts, tEnds) = alignment
167 if len(qExonSizes) != num_exons:
168 print 'BUG exon sizes %d'%id
169 continue
171 EST2GFF = False
172 new_line = ''
174 if EST2GFF:
175 predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
177 match = predExons[0,1] - predExons[0,0]
178 if predExons.shape == (2,2):
179 match += predExons[1,1] - predExons[1,0]
181 mismatch = 0
182 repmatch = 0
183 Ns = 0
184 Qgapcnt = 0
185 Qgapbs = 0
187 Tgapcnt = 0
188 Tgapbs = 0
190 if predExons.shape == (2,2):
191 Tgapbs += predExons[1,0] - predExons[0,1]
192 Tgapcnt = 1
194 # &strand, Qname, &Qsize,
195 # &Qstart, &Qend, Tname, &Tsize,
196 # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
197 #
198 # ATTENTION
199 #
200 # we enforce equal exons sizes for q and t because we are missing indel
201 # positions of the alignment
203 if qExonSizes[0] != tExonSizes[0]:
204 continue
206 Qname = '%d(%2.2f)'%(id,DPScores.tolist()[0][0])
207 Qsize = len(seq)
208 Qstart = qStart
209 Qend = qEnd
210 Tname = 'CHR%d'%chromo
212 start_pos -= 2
214 Tsize = tEnd+1 + start_pos
215 Tstart = tStart + start_pos
216 Tend = tEnd + start_pos
217 blockcnt = Tgapcnt+1
218 exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
219 Qstarts_str = str(qStarts)[1:-1].replace(' ','')
220 Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
222 new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
223 % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
224 Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
225 Tname, Tsize, Tstart, Tend,\
226 blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
228 else:
229 #try:
230 exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
231 #except:
232 # print 'Bug in example %d (idx: %d)' %\
233 # (current_prediction['id'],pred_idx)
234 # continue
236 #t_size = tEnd - tStart
237 #read_size = 38
238 #if strand == '-':
239 # total_size = seqInfo.chromo_sizes[chromo]
241 # start_pos = total_size - start_pos
243 # qExonSizes.reverse()
244 # qStarts = [read_size-e for e in qEnds]
245 # qStarts.reverse()
246 # qEnds = [read_size-e for e in qStarts]
247 # qEnds.reverse()
249 # tExonSizes.reverse()
250 # tStarts = [t_size-e for e in tEnds]
251 # tStarts.reverse()
252 # tEnds = [t_size-e for e in tStarts]
253 # tEnds.reverse()
255 # exonIdentities.reverse()
256 # exonGaps.reverse()
258 pp = lambda x : str(x)[1:-1].replace(' ','')
260 new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
261 (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
262 pp(qExonSizes),pp(qStarts),\
263 pp(qEnds),pp(tExonSizes),\
264 pp(tStarts),pp(tEnds),\
265 pp(exonIdentities),pp(exonGaps))
267 return new_line
270 def createGenefindingOutput(current_loc,out_fname):
271 """
272 """
273 pass
276 def createShoReOutput(current_loc,out_fname):
277 """
278 """
279 pass