+ removed some obsolete dependencies
[qpalma.git] / scripts / createAlignmentFileFromPrediction.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import cPickle
5 import math
6 import numpy
7 import os
8 import os.path
9 import pdb
10 import sys
11
12 from Utils import pprint_alignment
13
14 import palma.palma_utils as pu
15 from palma.output_formating import print_results
16
17 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
18
19
20 def alignment_reconstruct(current_prediction,num_exons):
21 """
22 We reconstruct the exact alignment to infer the positions of indels and the
23 sizes of the respective exons.
24 """
25 translation = '-acgtn_' # how aligned est and dna sequence is displayed (gap_char, 5 nucleotides, intron_char)
26
27 SpliceAlign = current_prediction['spliceAlign']
28 estAlign = current_prediction['estAlign']
29
30 dna_array = current_prediction['dna_array']
31 read_array = current_prediction['read_array']
32
33 dna_array = "".join(map(lambda x: translation[int(x)],dna_array))
34 read_array = "".join(map(lambda x: translation[int(x)],read_array))
35
36 # this array holds a number for each exon indicating its number of matches
37 exonIdentities = [0]*num_exons
38 exonGaps = [0]*num_exons
39
40 gaps = 0
41 identity = 0
42 idx = 0
43
44 for ridx in range(len(read_array)):
45 read_elem = read_array[ridx]
46 dna_elem = dna_array[ridx]
47
48 if ridx > 0 and read_array[ridx-1] != '_' and read_array[ridx] == '_':
49 idx += 1
50 gaps = 0
51 identity = 0
52
53 if read_elem == '_':
54 continue
55
56 if read_elem == dna_elem:
57 identity += 1
58
59 if read_elem == '-':
60 gaps += 1
61
62 exonIdentities[idx] = identity
63 exonGaps[idx] = gaps
64
65 return exonIdentities,exonGaps
66
67
68 def create_alignment_file(current_loc,out_fname):
69
70 out_fh = open(out_fname,'w+')
71 allPredictions = cPickle.load(open(current_loc))
72 #allPredictions = current_loc
73
74 print 'Loaded %d examples' % len(allPredictions)
75
76 # fetch the data needed
77 g_dir = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
78 acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
79 don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
80
81 g_fmt = 'chr%d.dna.flat'
82 s_fmt = 'contig_%d%s'
83
84 num_chromo = 6
85
86 accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
87 seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
88
89 # Uniqueness filtering using alignment score for reads with several
90 # alignments
91 allUniquePredictions = {}
92
93 for new_prediction in allPredictions:
94 id = new_prediction['id']
95 if allUniquePredictions.has_key(id):
96 current_prediction = allUniquePredictions[id]
97
98 current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
99 new_score = new_prediction['DPScores'].flatten().tolist()[0][0]
100
101 if current_a_score < new_score :
102 allUniquePredictions[id] = new_prediction
103
104 else:
105 allUniquePredictions[id] = new_prediction
106
107 written_lines = 0
108 for pred_idx,current_prediction in enumerate(allUniquePredictions.values()):
109
110 id = current_prediction['id']
111
112 seq = current_prediction['read']
113 dna = current_prediction['dna']
114
115 # CHECK !!!
116 q1 = 'zzz'
117
118 chromo = current_prediction['chr']
119 strand = current_prediction['strand']
120 start_pos = current_prediction['start_pos']
121 alignment = current_prediction['alignment']
122
123 DPScores = current_prediction['DPScores']
124 predExons = current_prediction['predExons']
125 predExons = [e+start_pos for e in predExons]
126
127 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
128 tExonSizes,tStarts, tEnds) = alignment
129
130 if len(qExonSizes) != num_exons:
131 print 'BUG exon sizes %d'%id
132 continue
133
134 EST2GFF = False
135 new_line = ''
136
137 if EST2GFF:
138 predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
139
140 match = predExons[0,1] - predExons[0,0]
141 if predExons.shape == (2,2):
142 match += predExons[1,1] - predExons[1,0]
143
144 mismatch = 0
145 repmatch = 0
146 Ns = 0
147 Qgapcnt = 0
148 Qgapbs = 0
149
150 Tgapcnt = 0
151 Tgapbs = 0
152
153 if predExons.shape == (2,2):
154 Tgapbs += predExons[1,0] - predExons[0,1]
155 Tgapcnt = 1
156
157 # &strand, Qname, &Qsize,
158 # &Qstart, &Qend, Tname, &Tsize,
159 # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
160 #
161 # ATTENTION
162 #
163 # we enforce equal exons sizes for q and t because we are missing indel
164 # positions of the alignment
165
166 if qExonSizes[0] != tExonSizes[0]:
167 continue
168
169 Qname = '%d(%2.2f)'%(id,DPScores.tolist()[0][0])
170 Qsize = len(seq)
171 Qstart = qStart
172 Qend = qEnd
173 Tname = 'CHR%d'%chromo
174
175 start_pos -= 2
176
177 Tsize = tEnd+1 + start_pos
178 Tstart = tStart + start_pos
179 Tend = tEnd + start_pos
180 blockcnt = Tgapcnt+1
181 exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
182 Qstarts_str = str(qStarts)[1:-1].replace(' ','')
183 Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
184
185 new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
186 % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
187 Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
188 Tname, Tsize, Tstart, Tend,\
189 blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
190
191 else:
192 #try:
193 exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
194 #except:
195 # print 'Bug in example %d (idx: %d)' %\
196 # (current_prediction['id'],pred_idx)
197 # continue
198
199 #t_size = tEnd - tStart
200 #read_size = 38
201 #if strand == '-':
202 # total_size = seqInfo.chromo_sizes[chromo]
203
204 # start_pos = total_size - start_pos
205
206 # qExonSizes.reverse()
207 # qStarts = [read_size-e for e in qEnds]
208 # qStarts.reverse()
209 # qEnds = [read_size-e for e in qStarts]
210 # qEnds.reverse()
211
212 # tExonSizes.reverse()
213 # tStarts = [t_size-e for e in tEnds]
214 # tStarts.reverse()
215 # tEnds = [t_size-e for e in tStarts]
216 # tEnds.reverse()
217
218 # exonIdentities.reverse()
219 # exonGaps.reverse()
220
221 pp = lambda x : str(x)[1:-1].replace(' ','')
222
223 new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
224 (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
225 pp(qExonSizes),pp(qStarts),\
226 pp(qEnds),pp(tExonSizes),\
227 pp(tStarts),pp(tEnds),\
228 pp(exonIdentities),pp(exonGaps))
229
230 out_fh.write(new_line)
231 written_lines += 1
232
233 print 'Wrote out %d lines' % written_lines
234
235
236 if __name__ == '__main__':
237 current_dir = sys.argv[1]
238 out_fh = sys.argv[2]
239 create_alignment_file(current_dir,out_fh)