2 # -*- coding: utf-8 -*-
4 # This program is free software; you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation; either version 2 of the License, or
7 # (at your option) any later version.
9 # Written (W) 2008 Fabio De Bona
10 # Copyright (C) 2008 Max-Planck-Society
20 from qpalma
.utils
import pprint_alignment
22 import palma
.palma_utils
as pu
23 from palma
.output_formating
import print_results
25 from qpalma
.sequence_utils
import SeqSpliceInfo
,DataAccessWrapper
27 # a little auxiliary function
28 pp
= lambda x
: str(x
)[1:-1].replace(' ','')
31 def alignment_reconstruct(current_prediction
,num_exons
):
33 We reconstruct the exact alignment to infer the positions of indels and the
34 sizes of the respective exons.
37 translation
= '-acgtn_' # how aligned est and dna sequence is displayed (gap_char, 5 nucleotides, intron_char)
39 SpliceAlign
= current_prediction
['spliceAlign']
40 estAlign
= current_prediction
['estAlign']
42 dna_array
= current_prediction
['dna_array']
43 read_array
= current_prediction
['read_array']
45 dna_array
= "".join(map(lambda x
: translation
[int(x
)],dna_array
))
46 read_array
= "".join(map(lambda x
: translation
[int(x
)],read_array
))
48 # this array holds a number for each exon indicating its number of matches
49 exonIdentities
= [0]*num_exons
50 exonGaps
= [0]*num_exons
56 for ridx
in range(len(read_array
)):
57 read_elem
= read_array
[ridx
]
58 dna_elem
= dna_array
[ridx
]
60 if ridx
> 0 and read_array
[ridx
-1] != '_' and read_array
[ridx
] == '_':
68 if read_elem
== dna_elem
:
74 exonIdentities
[idx
] = identity
77 return exonIdentities
,exonGaps
80 def create_alignment_file(current_loc
,out_fname
):
82 out_fh
= open(out_fname
,'w+')
83 allPredictions
= cPickle
.load(open(current_loc
))
84 #allPredictions = current_loc
86 print 'Loaded %d examples' % len(allPredictions
)
88 # fetch the data needed
89 accessWrapper
= DataAccessWrapper(settings
)
90 seqInfo
= SeqSpliceInfo(accessWrapper
,settings
['allowed_fragments'])
92 # Uniqueness filtering using alignment score for reads with several
94 allUniquePredictions
= {}
96 for new_prediction
in allPredictions
:
97 id = new_prediction
['id']
98 if allUniquePredictions
.has_key(id):
99 current_prediction
= allUniquePredictions
[id]
101 current_a_score
= current_prediction
['DPScores'].flatten().tolist()[0][0]
102 new_score
= new_prediction
['DPScores'].flatten().tolist()[0][0]
104 if current_a_score
< new_score
:
105 allUniquePredictions
[id] = new_prediction
108 allUniquePredictions
[id] = new_prediction
111 for pred_idx
,current_prediction
in enumerate(allUniquePredictions
.values()):
113 id = current_prediction
['id']
115 seq
= current_prediction
['read']
116 dna
= current_prediction
['dna']
121 chromo
= current_prediction
['chr']
122 strand
= current_prediction
['strand']
123 start_pos
= current_prediction
['start_pos']
124 alignment
= current_prediction
['alignment']
126 DPScores
= current_prediction
['DPScores']
127 predExons
= current_prediction
['predExons']
128 predExons
= [e
+start_pos
for e
in predExons
]
130 (qStart
, qEnd
, tStart
, tEnd
, num_exons
, qExonSizes
, qStarts
, qEnds
,\
131 tExonSizes
,tStarts
, tEnds
) = alignment
133 if len(qExonSizes
) != num_exons
:
134 print 'BUG exon sizes %d'%id
141 predExons
= numpy
.mat(predExons
).reshape(len(predExons
)/2,2)
143 match
= predExons
[0,1] - predExons
[0,0]
144 if predExons
.shape
== (2,2):
145 match
+= predExons
[1,1] - predExons
[1,0]
156 if predExons
.shape
== (2,2):
157 Tgapbs
+= predExons
[1,0] - predExons
[0,1]
162 # we enforce equal exons sizes for q and t because we are missing indel
163 # positions of the alignment
165 if qExonSizes
[0] != tExonSizes
[0]:
168 Qname
= '%d(%2.2f)'%(id,DPScores
.tolist()[0][0])
172 Tname
= 'CHR%d'%chromo
176 Tsize
= tEnd
+1 + start_pos
177 Tstart
= tStart
+ start_pos
178 Tend
= tEnd
+ start_pos
180 exonsizes_str
= str(tExonSizes
)[1:-1].replace(' ','')
181 Qstarts_str
= str(qStarts
)[1:-1].replace(' ','')
182 Tstarts_str
= str(map(lambda x
: x
+start_pos
,tStarts
))[1:-1].replace(' ','')
184 new_line
= "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
185 % (match
, mismatch
, repmatch
, Ns
, Qgapcnt
, Qgapbs
,\
186 Tgapcnt
, Tgapbs
, strand
, Qname
, Qsize
, Qstart
, Qend
,\
187 Tname
, Tsize
, Tstart
, Tend
,\
188 blockcnt
,exonsizes_str
,Qstarts_str
,Tstarts_str
) #str(predExons))
191 exonIdentities
,exonGaps
= alignment_reconstruct(current_prediction
,num_exons
)
194 new_line
= '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
195 (id,chromo
,strand
,seq
,str(q1
)[1:-1],start_pos
,qStart
,qEnd
,tStart
,tEnd
,num_exons
,\
196 pp(qExonSizes
),pp(qStarts
),\
197 pp(qEnds
),pp(tExonSizes
),\
198 pp(tStarts
),pp(tEnds
),\
199 pp(exonIdentities
),pp(exonGaps
))
201 out_fh
.write(new_line
)
204 print 'Wrote out %d lines' % written_lines
207 def run(chunk_dir
,outfile
):
212 out_fh
= open(outfile
,'w+')
214 # fetch the data needed
215 accessWrapper
= DataAccessWrapper(settings
)
216 seqInfo
= SeqSpliceInfo(accessWrapper
,settings
['allowed_fragments'])
218 for line
in open(result_fn
):
223 start_pos
= int(sl
[5])-2
224 starts
= sl
[15].split(',')
225 ends
= sl
[16].split(',')
227 ids
= sl
[-2].split(',')
228 ids
= [int(elem
) for elem
in ids
]
229 gaps
= sl
[-1].split(',')
230 gaps
= [int(elem
) for elem
in gaps
]
235 total_size
= seqInfo
.chromo_sizes
[chromo
]
236 start_pos
= total_size
- start_pos
240 starts
= [int(elem
) + start_pos
for elem
in starts
]
241 ends
= [int(elem
) + start_pos
for elem
in ends
]
244 starts
= [e
+1 for e
in starts
]
245 ends
= [e
+1 for e
in ends
]
247 starts
= [e
-3001 for e
in starts
]
248 ends
= [e
-3001 for e
in ends
]
250 out_fh
.write("%d\t%s\t%s\t%s\t%s\t%s\n"%(chromo
,strand
,pp(starts
),pp(ends
),pp(ids
),pp(gaps
)))