+ added some text and references to the documentation
[qpalma.git] / scripts / createAlignmentFileFromPrediction.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 # This program is free software; you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation; either version 2 of the License, or
7 # (at your option) any later version.
8 #
9 # Written (W) 2008 Fabio De Bona
10 # Copyright (C) 2008 Max-Planck-Society
11
12 import cPickle
13 import math
14 import numpy
15 import os
16 import os.path
17 import pdb
18 import sys
19
20 from qpalma.utils import pprint_alignment
21
22 import palma.palma_utils as pu
23 from palma.output_formating import print_results
24
25 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
26
27 # a little auxiliary function
28 pp = lambda x : str(x)[1:-1].replace(' ','')
29
30
31 def alignment_reconstruct(current_prediction,num_exons):
32 """
33 We reconstruct the exact alignment to infer the positions of indels and the
34 sizes of the respective exons.
35 """
36
37 translation = '-acgtn_' # how aligned est and dna sequence is displayed (gap_char, 5 nucleotides, intron_char)
38
39 SpliceAlign = current_prediction['spliceAlign']
40 estAlign = current_prediction['estAlign']
41
42 dna_array = current_prediction['dna_array']
43 read_array = current_prediction['read_array']
44
45 dna_array = "".join(map(lambda x: translation[int(x)],dna_array))
46 read_array = "".join(map(lambda x: translation[int(x)],read_array))
47
48 # this array holds a number for each exon indicating its number of matches
49 exonIdentities = [0]*num_exons
50 exonGaps = [0]*num_exons
51
52 gaps = 0
53 identity = 0
54 idx = 0
55
56 for ridx in range(len(read_array)):
57 read_elem = read_array[ridx]
58 dna_elem = dna_array[ridx]
59
60 if ridx > 0 and read_array[ridx-1] != '_' and read_array[ridx] == '_':
61 idx += 1
62 gaps = 0
63 identity = 0
64
65 if read_elem == '_':
66 continue
67
68 if read_elem == dna_elem:
69 identity += 1
70
71 if read_elem == '-':
72 gaps += 1
73
74 exonIdentities[idx] = identity
75 exonGaps[idx] = gaps
76
77 return exonIdentities,exonGaps
78
79
80 def create_alignment_file(current_loc,out_fname):
81
82 out_fh = open(out_fname,'w+')
83 allPredictions = cPickle.load(open(current_loc))
84 #allPredictions = current_loc
85
86 print 'Loaded %d examples' % len(allPredictions)
87
88 # fetch the data needed
89 accessWrapper = DataAccessWrapper(settings)
90 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
91
92 # Uniqueness filtering using alignment score for reads with several
93 # alignments
94 allUniquePredictions = {}
95
96 for new_prediction in allPredictions:
97 id = new_prediction['id']
98 if allUniquePredictions.has_key(id):
99 current_prediction = allUniquePredictions[id]
100
101 current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
102 new_score = new_prediction['DPScores'].flatten().tolist()[0][0]
103
104 if current_a_score < new_score :
105 allUniquePredictions[id] = new_prediction
106
107 else:
108 allUniquePredictions[id] = new_prediction
109
110 written_lines = 0
111 for pred_idx,current_prediction in enumerate(allUniquePredictions.values()):
112
113 id = current_prediction['id']
114
115 seq = current_prediction['read']
116 dna = current_prediction['dna']
117
118 # CHECK !!!
119 q1 = 'zzz'
120
121 chromo = current_prediction['chr']
122 strand = current_prediction['strand']
123 start_pos = current_prediction['start_pos']
124 alignment = current_prediction['alignment']
125
126 DPScores = current_prediction['DPScores']
127 predExons = current_prediction['predExons']
128 predExons = [e+start_pos for e in predExons]
129
130 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
131 tExonSizes,tStarts, tEnds) = alignment
132
133 if len(qExonSizes) != num_exons:
134 print 'BUG exon sizes %d'%id
135 continue
136
137 EST2GFF = False
138 new_line = ''
139
140 if EST2GFF:
141 predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
142
143 match = predExons[0,1] - predExons[0,0]
144 if predExons.shape == (2,2):
145 match += predExons[1,1] - predExons[1,0]
146
147 mismatch = 0
148 repmatch = 0
149 Ns = 0
150 Qgapcnt = 0
151 Qgapbs = 0
152
153 Tgapcnt = 0
154 Tgapbs = 0
155
156 if predExons.shape == (2,2):
157 Tgapbs += predExons[1,0] - predExons[0,1]
158 Tgapcnt = 1
159
160 # ATTENTION
161 #
162 # we enforce equal exons sizes for q and t because we are missing indel
163 # positions of the alignment
164
165 if qExonSizes[0] != tExonSizes[0]:
166 continue
167
168 Qname = '%d(%2.2f)'%(id,DPScores.tolist()[0][0])
169 Qsize = len(seq)
170 Qstart = qStart
171 Qend = qEnd
172 Tname = 'CHR%d'%chromo
173
174 start_pos -= 2
175
176 Tsize = tEnd+1 + start_pos
177 Tstart = tStart + start_pos
178 Tend = tEnd + start_pos
179 blockcnt = Tgapcnt+1
180 exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
181 Qstarts_str = str(qStarts)[1:-1].replace(' ','')
182 Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
183
184 new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
185 % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
186 Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
187 Tname, Tsize, Tstart, Tend,\
188 blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
189
190 else:
191 exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
192
193
194 new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
195 (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
196 pp(qExonSizes),pp(qStarts),\
197 pp(qEnds),pp(tExonSizes),\
198 pp(tStarts),pp(tEnds),\
199 pp(exonIdentities),pp(exonGaps))
200
201 out_fh.write(new_line)
202 written_lines += 1
203
204 print 'Wrote out %d lines' % written_lines
205
206
207 def run(chunk_dir,outfile):
208 """
209 The run...
210 """
211
212 out_fh = open(outfile,'w+')
213
214 # fetch the data needed
215 accessWrapper = DataAccessWrapper(settings)
216 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
217
218
219 for line in open(result_fn):
220 sl = line.split()
221
222 chromo = int(sl[1])
223 strand = sl[2]
224 start_pos = int(sl[5])-2
225 starts = sl[15].split(',')
226 ends = sl[16].split(',')
227
228 ids = sl[-2].split(',')
229 ids = [int(elem) for elem in ids]
230 gaps = sl[-1].split(',')
231 gaps = [int(elem) for elem in gaps]
232
233 if strand == '+':
234 pass
235 elif strand == '-':
236 total_size = seqInfo.chromo_sizes[chromo]
237 start_pos = total_size - start_pos
238 else:
239 assert False
240
241 starts = [int(elem) + start_pos for elem in starts]
242 ends = [int(elem) + start_pos for elem in ends]
243
244 if strand == '+':
245 starts = [e+1 for e in starts]
246 ends = [e+1 for e in ends]
247 else:
248 starts = [e-3001 for e in starts]
249 ends = [e-3001 for e in ends]
250
251 out_fh.write("%d\t%s\t%s\t%s\t%s\t%s\n"%(chromo,strand,pp(starts),pp(ends),pp(ids),pp(gaps)))