+ added some testcases
[qpalma.git] / scripts / createAlignmentFileFromPrediction.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import cPickle
5 import sys
6 import pdb
7 import os
8 import os.path
9 import math
10 import numpy
11
12 import qparser
13
14 from Utils import pprint_alignment
15
16 import palma.palma_utils as pu
17 from palma.output_formating import print_results
18
19
20 def alignment_reconstruct(current_prediction,num_exons):
21 """
22 We reconstruct the exact alignment to infer the positions of indels and the
23 sizes of the respective exons.
24 """
25 translation = '-acgtn_' # how aligned est and dna sequence is displayed (gap_char, 5 nucleotides, intron_char)
26
27 SpliceAlign = current_prediction['spliceAlign']
28 estAlign = current_prediction['estAlign']
29
30 dna_array = current_prediction['dna_array']
31 read_array = current_prediction['read_array']
32
33 dna_array = "".join(map(lambda x: translation[int(x)],dna_array))
34 read_array = "".join(map(lambda x: translation[int(x)],read_array))
35
36 # this array holds a number for each exon indicating its number of matches
37 exonIdentities = [0]*num_exons
38 exonGaps = [0]*num_exons
39
40 gaps = 0
41 identity = 0
42 idx = 0
43
44 for ridx in range(len(read_array)):
45 read_elem = read_array[ridx]
46 dna_elem = dna_array[ridx]
47
48 if ridx > 0 and read_array[ridx-1] != '_' and read_array[ridx] == '_':
49 idx += 1
50 gaps = 0
51 identity = 0
52
53 if read_elem == '_':
54 continue
55
56 if read_elem == dna_elem:
57 identity += 1
58
59 if read_elem == '-':
60 gaps += 1
61
62 exonIdentities[idx] = identity
63 exonGaps[idx] = gaps
64
65 return exonIdentities,exonGaps
66
67
68 def create_alignment_file(current_loc,out_fname):
69 print 'Working on %s'%current_loc
70
71 out_fh = open(out_fname,'w+')
72 allPredictions = cPickle.load(open(current_loc))
73 #allPredictions = current_loc
74
75 print 'Loaded %d examples' % len(allPredictions)
76
77 written_lines = 0
78 for pred_idx,current_prediction in enumerate(allPredictions):
79 id = current_prediction['id']
80
81 #current_ground_truth = qparser.fetch_read(id)
82 #true_cut = current_ground_truth['true_cut']
83 #seq = current_ground_truth['seq']
84 #q1 = current_ground_truth['prb']
85
86 seq = current_prediction['read']
87 dna = current_prediction['dna']
88
89 # CHECK !!!
90 q1 = 'zzz'
91
92 chromo = current_prediction['chr']
93 strand = current_prediction['strand']
94 start_pos = current_prediction['start_pos']
95 alignment = current_prediction['alignment']
96
97 DPScores = current_prediction['DPScores']
98
99 predExons = current_prediction['predExons']
100 predExons = [e+start_pos for e in predExons]
101
102 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
103 tExonSizes,tStarts, tEnds) = alignment
104
105 if len(qExonSizes) != num_exons:
106 print 'BUG exon sizes %d'%id
107 continue
108
109 EST2GFF = False
110 new_line = ''
111
112 if EST2GFF:
113 predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
114
115 match = predExons[0,1] - predExons[0,0]
116 if predExons.shape == (2,2):
117 match += predExons[1,1] - predExons[1,0]
118
119 mismatch = 0
120 repmatch = 0
121 Ns = 0
122 Qgapcnt = 0
123 Qgapbs = 0
124
125 Tgapcnt = 0
126 Tgapbs = 0
127
128 if predExons.shape == (2,2):
129 Tgapbs += predExons[1,0] - predExons[0,1]
130 Tgapcnt = 1
131
132 # &strand, Qname, &Qsize,
133 # &Qstart, &Qend, Tname, &Tsize,
134 # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
135 #
136 # ATTENTION
137 #
138 # we enforce equal exons sizes for q and t because we are missing indel
139 # positions of the alignment
140
141 if qExonSizes[0] != tExonSizes[0]:
142 continue
143
144 Qname = '%d(%2.2f)'%(id,DPScores.tolist()[0][0])
145 Qsize = len(seq)
146 Qstart = qStart
147 Qend = qEnd
148 Tname = 'CHR%d'%chromo
149
150 start_pos -= 2
151
152 Tsize = tEnd+1 + start_pos
153 Tstart = tStart + start_pos
154 Tend = tEnd + start_pos
155 blockcnt = Tgapcnt+1
156 exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
157 Qstarts_str = str(qStarts)[1:-1].replace(' ','')
158 Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
159
160 new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
161 % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
162 Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
163 Tname, Tsize, Tstart, Tend,\
164 blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
165
166 else:
167
168 try:
169 exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
170 except:
171 print 'Bug in example %d (idx: %d)' %\
172 (current_prediction['id'],pred_idx)
173 continue
174
175 pp = lambda x : str(x)[1:-1].replace(' ','')
176
177 new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
178 (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
179 pp(qExonSizes),pp(qStarts),\
180 pp(qEnds),pp(tExonSizes),\
181 pp(tStarts),pp(tEnds),\
182 pp(exonIdentities),pp(exonGaps))
183 written_lines += 1
184
185 out_fh.write(new_line)
186
187 print 'Wrote out %d lines' % written_lines
188
189 if __name__ == '__main__':
190 current_dir = sys.argv[1]
191 out_fh = sys.argv[2]
192 create_alignment_file(current_dir,out_fh)