c772b7ebbbccb6ed9907ffc56b7c4f801d1fa822
[qpalma.git] / scripts / createAlignmentFileFromPrediction.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 # This program is free software; you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation; either version 2 of the License, or
7 # (at your option) any later version.
8 #
9 # Written (W) 2008 Fabio De Bona
10 # Copyright (C) 2008 Max-Planck-Society
11
12 import cPickle
13 import math
14 import numpy
15 import os
16 import os.path
17 import pdb
18 import sys
19
20 from qpalma.utils import pprint_alignment
21
22 import palma.palma_utils as pu
23 from palma.output_formating import print_results
24
25 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
26
27
28 def alignment_reconstruct(current_prediction,num_exons):
29 """
30 We reconstruct the exact alignment to infer the positions of indels and the
31 sizes of the respective exons.
32 """
33 translation = '-acgtn_' # how aligned est and dna sequence is displayed (gap_char, 5 nucleotides, intron_char)
34
35 SpliceAlign = current_prediction['spliceAlign']
36 estAlign = current_prediction['estAlign']
37
38 dna_array = current_prediction['dna_array']
39 read_array = current_prediction['read_array']
40
41 dna_array = "".join(map(lambda x: translation[int(x)],dna_array))
42 read_array = "".join(map(lambda x: translation[int(x)],read_array))
43
44 # this array holds a number for each exon indicating its number of matches
45 exonIdentities = [0]*num_exons
46 exonGaps = [0]*num_exons
47
48 gaps = 0
49 identity = 0
50 idx = 0
51
52 for ridx in range(len(read_array)):
53 read_elem = read_array[ridx]
54 dna_elem = dna_array[ridx]
55
56 if ridx > 0 and read_array[ridx-1] != '_' and read_array[ridx] == '_':
57 idx += 1
58 gaps = 0
59 identity = 0
60
61 if read_elem == '_':
62 continue
63
64 if read_elem == dna_elem:
65 identity += 1
66
67 if read_elem == '-':
68 gaps += 1
69
70 exonIdentities[idx] = identity
71 exonGaps[idx] = gaps
72
73 return exonIdentities,exonGaps
74
75
76 def create_alignment_file(current_loc,out_fname):
77
78 out_fh = open(out_fname,'w+')
79 allPredictions = cPickle.load(open(current_loc))
80 #allPredictions = current_loc
81
82 print 'Loaded %d examples' % len(allPredictions)
83
84 # fetch the data needed
85 g_dir = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
86 acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
87 don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
88
89 g_fmt = 'chr%d.dna.flat'
90 s_fmt = 'contig_%d%s'
91
92 num_chromo = 6
93
94 accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
95 seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
96
97 # Uniqueness filtering using alignment score for reads with several
98 # alignments
99 allUniquePredictions = {}
100
101 for new_prediction in allPredictions:
102 id = new_prediction['id']
103 if allUniquePredictions.has_key(id):
104 current_prediction = allUniquePredictions[id]
105
106 current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
107 new_score = new_prediction['DPScores'].flatten().tolist()[0][0]
108
109 if current_a_score < new_score :
110 allUniquePredictions[id] = new_prediction
111
112 else:
113 allUniquePredictions[id] = new_prediction
114
115 written_lines = 0
116 for pred_idx,current_prediction in enumerate(allUniquePredictions.values()):
117
118 id = current_prediction['id']
119
120 seq = current_prediction['read']
121 dna = current_prediction['dna']
122
123 # CHECK !!!
124 q1 = 'zzz'
125
126 chromo = current_prediction['chr']
127 strand = current_prediction['strand']
128 start_pos = current_prediction['start_pos']
129 alignment = current_prediction['alignment']
130
131 DPScores = current_prediction['DPScores']
132 predExons = current_prediction['predExons']
133 predExons = [e+start_pos for e in predExons]
134
135 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
136 tExonSizes,tStarts, tEnds) = alignment
137
138 if len(qExonSizes) != num_exons:
139 print 'BUG exon sizes %d'%id
140 continue
141
142 EST2GFF = False
143 new_line = ''
144
145 if EST2GFF:
146 predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
147
148 match = predExons[0,1] - predExons[0,0]
149 if predExons.shape == (2,2):
150 match += predExons[1,1] - predExons[1,0]
151
152 mismatch = 0
153 repmatch = 0
154 Ns = 0
155 Qgapcnt = 0
156 Qgapbs = 0
157
158 Tgapcnt = 0
159 Tgapbs = 0
160
161 if predExons.shape == (2,2):
162 Tgapbs += predExons[1,0] - predExons[0,1]
163 Tgapcnt = 1
164
165 # &strand, Qname, &Qsize,
166 # &Qstart, &Qend, Tname, &Tsize,
167 # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
168 #
169 # ATTENTION
170 #
171 # we enforce equal exons sizes for q and t because we are missing indel
172 # positions of the alignment
173
174 if qExonSizes[0] != tExonSizes[0]:
175 continue
176
177 Qname = '%d(%2.2f)'%(id,DPScores.tolist()[0][0])
178 Qsize = len(seq)
179 Qstart = qStart
180 Qend = qEnd
181 Tname = 'CHR%d'%chromo
182
183 start_pos -= 2
184
185 Tsize = tEnd+1 + start_pos
186 Tstart = tStart + start_pos
187 Tend = tEnd + start_pos
188 blockcnt = Tgapcnt+1
189 exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
190 Qstarts_str = str(qStarts)[1:-1].replace(' ','')
191 Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
192
193 new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
194 % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
195 Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
196 Tname, Tsize, Tstart, Tend,\
197 blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
198
199 else:
200 #try:
201 exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
202 #except:
203 # print 'Bug in example %d (idx: %d)' %\
204 # (current_prediction['id'],pred_idx)
205 # continue
206
207 #t_size = tEnd - tStart
208 #read_size = 38
209 #if strand == '-':
210 # total_size = seqInfo.chromo_sizes[chromo]
211
212 # start_pos = total_size - start_pos
213
214 # qExonSizes.reverse()
215 # qStarts = [read_size-e for e in qEnds]
216 # qStarts.reverse()
217 # qEnds = [read_size-e for e in qStarts]
218 # qEnds.reverse()
219
220 # tExonSizes.reverse()
221 # tStarts = [t_size-e for e in tEnds]
222 # tStarts.reverse()
223 # tEnds = [t_size-e for e in tStarts]
224 # tEnds.reverse()
225
226 # exonIdentities.reverse()
227 # exonGaps.reverse()
228
229 pp = lambda x : str(x)[1:-1].replace(' ','')
230
231 new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
232 (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
233 pp(qExonSizes),pp(qStarts),\
234 pp(qEnds),pp(tExonSizes),\
235 pp(tStarts),pp(tEnds),\
236 pp(exonIdentities),pp(exonGaps))
237
238 out_fh.write(new_line)
239 written_lines += 1
240
241 print 'Wrote out %d lines' % written_lines
242
243
244 if __name__ == '__main__':
245 current_dir = sys.argv[1]
246 out_fh = sys.argv[2]
247 create_alignment_file(current_dir,out_fh)