+ added alignment reconstruction function
[qpalma.git] / scripts / createAlignmentFileFromPrediction.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import cPickle
5 import sys
6 import pdb
7 import os
8 import os.path
9 import math
10 import numpy
11
12 import qparser
13
14 from Utils import pprint_alignment
15
16 import palma.palma_utils as pu
17 from palma.output_formating import print_results
18
19
20 def alignment_reconstruct(current_prediction,num_exons):
21 """
22 We reconstruct the exact alignment to infer the positions of indels and the
23 sizes of the respective exons.
24 """
25
26 SpliceAlign = current_prediction['spliceAlign']
27 estAlign = current_prediction['estAlign']
28
29 dna_array = current_prediction['dna_array']
30 est_array = current_prediction['est_array']
31
32 dna_array = "".join(map(lambda x: str(x),dna_array))
33 est_array = "".join(map(lambda x: str(x),est_array))
34
35 translation = '-ACGTN_' #how aligned est and dna sequence is displayed
36 #(gap_char, 5 nucleotides, intron_char)
37
38 # this array hold a number for each exons indicating its number of matches
39 exonIdentities = [0]*num_exons
40 exonGaps = [0]*num_exons
41
42 est_array = map(lambda x: translation[int(x)],est_array)
43 dna_array = map(lambda x: translation[int(x)],dna_array)
44
45 gaps = 0
46 identity = 0
47 idx = 0
48
49 #if num_exons > 1:
50 # pdb.set_trace()
51
52 for ridx in range(len(est_array)):
53 read_elem = est_array[ridx]
54 dna_elem = dna_array[ridx]
55
56 if ridx > 0 and est_array[ridx-1] != '_' and est_array[ridx] == '_':
57 idx += 1
58 gaps = 0
59 identity = 0
60
61 if read_elem == '_':
62 continue
63
64 if read_elem == dna_elem:
65 identity += 1
66
67 if read_elem == '-':
68 gaps += 1
69
70 exonIdentities[idx] = identity
71 exonGaps[idx] = gaps
72
73 return exonIdentities,exonGaps
74
75
76 def create_alignment_file(current_loc,out_fname):
77 import numpy
78 #qparser.parse_reads(filtered_reads)
79
80 out_fh = open(out_fname,'w+')
81
82 allPredictions = cPickle.load(open(current_loc))
83 #allPredictions = current_loc
84
85 allPositions = {}
86
87 for current_prediction in allPredictions:
88 id = current_prediction['id']
89
90 #current_ground_truth = qparser.fetch_read(id)
91 #true_cut = current_ground_truth['true_cut']
92 #seq = current_ground_truth['seq']
93 #q1 = current_ground_truth['prb']
94
95 seq = current_prediction['est']
96 dna = current_prediction['dna']
97
98 # CHECK !!!
99 q1 = 'zzz'
100
101 chromo = current_prediction['chr']
102 strand = current_prediction['strand']
103 start_pos = current_prediction['start_pos']
104 alignment = current_prediction['alignment']
105
106 DPScores = current_prediction['DPScores']
107
108 predExons = current_prediction['predExons']
109 predExons = [e+start_pos for e in predExons]
110
111 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
112 tExonSizes,tStarts, tEnds) = alignment
113
114 EST2GFF = False
115 new_line = ''
116
117 if EST2GFF:
118 predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
119
120 match = predExons[0,1] - predExons[0,0]
121 if predExons.shape == (2,2):
122 match += predExons[1,1] - predExons[1,0]
123
124 mismatch = 0
125 repmatch = 0
126 Ns = 0
127 Qgapcnt = 0
128 Qgapbs = 0
129
130 Tgapcnt = 0
131 Tgapbs = 0
132
133 if predExons.shape == (2,2):
134 Tgapbs += predExons[1,0] - predExons[0,1]
135 Tgapcnt = 1
136
137 # &strand, Qname, &Qsize,
138 # &Qstart, &Qend, Tname, &Tsize,
139 # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
140 #
141 # ATTENTION
142 #
143 # we enforce equal exons sizes for q and t because we are missing indel
144 # positions of the alignment
145
146 if qExonSizes[0] != tExonSizes[0]:
147 continue
148
149 Qname = '%d(%2.2f)'%(id,DPScores.tolist()[0][0])
150 Qsize = len(seq)
151 Qstart = qStart
152 Qend = qEnd
153 Tname = 'CHR%d'%chromo
154
155 start_pos -= 2
156
157 Tsize = tEnd+1 + start_pos
158 Tstart = tStart + start_pos
159 Tend = tEnd + start_pos
160 blockcnt = Tgapcnt+1
161 exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
162 Qstarts_str = str(qStarts)[1:-1].replace(' ','')
163 Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
164
165 new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
166 % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
167 Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
168 Tname, Tsize, Tstart, Tend,\
169 blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
170
171 else:
172
173 num_exons = len(qExonSizes)
174 exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
175
176 pp = lambda x : str(x)[1:-1].replace(' ','')
177
178 new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
179 (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
180 str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
181 str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
182 str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''),\
183 str(exonIdentities)[1:-1].replace(' ',''),str(exonGaps)[1:-1].replace(' ',''))
184
185 out_fh.write(new_line)
186
187 return allPositions
188
189
190 if __name__ == '__main__':
191 current_dir = sys.argv[1]
192 out_fh = sys.argv[2]
193 create_alignment_file(current_dir,out_fh)