1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 import cPickle
5 import sys
6 import pdb
7 import os
8 import os.path
9 import math
10 import numpy
12 import qparser
14 from Utils import pprint_alignment
16 import palma.palma_utils as pu
17 from palma.output_formating import print_results
20 def alignment_reconstruct(current_prediction,num_exons):
21 """
22 We reconstruct the exact alignment to infer the positions of indels and the
23 sizes of the respective exons.
24 """
25 translation = '-acgtn_' # how aligned est and dna sequence is displayed (gap_char, 5 nucleotides, intron_char)
27 SpliceAlign = current_prediction['spliceAlign']
28 estAlign = current_prediction['estAlign']
30 dna_array = current_prediction['dna_array']
33 dna_array = "".join(map(lambda x: translation[int(x)],dna_array))
36 # this array holds a number for each exon indicating its number of matches
37 exonIdentities = [0]*num_exons
38 exonGaps = [0]*num_exons
40 gaps = 0
41 identity = 0
42 idx = 0
46 dna_elem = dna_array[ridx]
48 if ridx > 0 and read_array[ridx-1] != '_' and read_array[ridx] == '_':
49 idx += 1
50 gaps = 0
51 identity = 0
54 continue
57 identity += 1
60 gaps += 1
62 exonIdentities[idx] = identity
63 exonGaps[idx] = gaps
65 return exonIdentities,exonGaps
68 def create_alignment_file(current_loc,out_fname):
69 print 'Working on %s'%current_loc
71 out_fh = open(out_fname,'w+')
73 #allPredictions = current_loc
75 print 'Loaded %d examples' % len(allPredictions)
77 written_lines = 0
78 for pred_idx,current_prediction in enumerate(allPredictions):
79 id = current_prediction['id']
82 #true_cut = current_ground_truth['true_cut']
83 #seq = current_ground_truth['seq']
84 #q1 = current_ground_truth['prb']
87 dna = current_prediction['dna']
89 # CHECK !!!
90 q1 = 'zzz'
92 chromo = current_prediction['chr']
93 strand = current_prediction['strand']
94 start_pos = current_prediction['start_pos']
95 alignment = current_prediction['alignment']
97 DPScores = current_prediction['DPScores']
99 predExons = current_prediction['predExons']
100 predExons = [e+start_pos for e in predExons]
102 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
103 tExonSizes,tStarts, tEnds) = alignment
105 if len(qExonSizes) != num_exons:
106 print 'BUG exon sizes %d'%id
107 continue
109 EST2GFF = False
110 new_line = ''
112 if EST2GFF:
113 predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
115 match = predExons[0,1] - predExons[0,0]
116 if predExons.shape == (2,2):
117 match += predExons[1,1] - predExons[1,0]
119 mismatch = 0
120 repmatch = 0
121 Ns = 0
122 Qgapcnt = 0
123 Qgapbs = 0
125 Tgapcnt = 0
126 Tgapbs = 0
128 if predExons.shape == (2,2):
129 Tgapbs += predExons[1,0] - predExons[0,1]
130 Tgapcnt = 1
132 # &strand, Qname, &Qsize,
133 # &Qstart, &Qend, Tname, &Tsize,
134 # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
135 #
136 # ATTENTION
137 #
138 # we enforce equal exons sizes for q and t because we are missing indel
139 # positions of the alignment
141 if qExonSizes[0] != tExonSizes[0]:
142 continue
144 Qname = '%d(%2.2f)'%(id,DPScores.tolist()[0][0])
145 Qsize = len(seq)
146 Qstart = qStart
147 Qend = qEnd
148 Tname = 'CHR%d'%chromo
150 start_pos -= 2
152 Tsize = tEnd+1 + start_pos
153 Tstart = tStart + start_pos
154 Tend = tEnd + start_pos
155 blockcnt = Tgapcnt+1
156 exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
157 Qstarts_str = str(qStarts)[1:-1].replace(' ','')
158 Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
160 new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
161 % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
162 Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
163 Tname, Tsize, Tstart, Tend,\
164 blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
166 else:
168 try:
169 exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
170 except:
171 print 'Bug in example %d (idx: %d)' %\
172 (current_prediction['id'],pred_idx)
173 continue
175 pp = lambda x : str(x)[1:-1].replace(' ','')
177 new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
178 (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
179 pp(qExonSizes),pp(qStarts),\
180 pp(qEnds),pp(tExonSizes),\
181 pp(tStarts),pp(tEnds),\
182 pp(exonIdentities),pp(exonGaps))
183 written_lines += 1
185 out_fh.write(new_line)
187 print 'Wrote out %d lines' % written_lines
189 if __name__ == '__main__':
190 current_dir = sys.argv[1]
191 out_fh = sys.argv[2]
192 create_alignment_file(current_dir,out_fh)