2 # -*- coding: utf-8 -*-
7 from numpy
.matlib
import mat
,zeros
,ones
,inf
9 import palma
.palma_utils
as pu
10 from palma
.output_formating
import print_results
12 # some useful constants
14 extended_alphabet
= ['-','a','c','g','t','n','[',']']
15 alphabet
= ['-','a','c','g','t','n']
17 alignment_alphabet
= ['-','a','c','g','t','n','z']
def print_prediction(example):
    """Format one alignment example as a three-line alignment string.

    `example` is a dict carrying integer-coded 'dna_array' / 'read_array'
    (indices into alignment_alphabet) plus the 'spliceAlign' / 'estAlign'
    alignment vectors.

    NOTE(review): `result` is built but neither printed nor returned in
    this view -- confirm the intended sink against the original file.
    """
    dna_codes = example['dna_array']
    read_codes = example['read_array']

    # Decode the integer codes back into alignment characters.
    dna = map(lambda code: alignment_alphabet[code], dna_codes)
    read = map(lambda code: alignment_alphabet[code], read_codes)

    spliceAlign = example['spliceAlign']
    estAlign = example['estAlign']

    line1, line2, line3 = pprint_alignment(spliceAlign, estAlign, dna, read)

    result = '%s\n%s\n%s' % (line1, line2, line3)
35 def calc_info(acc
,don
,exons
,qualities
):
# Print summary statistics (max/min and a mean) for acceptor scores, donor
# scores, intron lengths and quality values of a dataset.
# NOTE(review): this extraction is missing the original lines that
# initialise min_score/max_score and open the loops that bind `score` and
# `elem`; the surviving fragments below are kept byte-identical.
42 min_score
= min(min_score
,score
)
43 max_score
= max(max_score
,score
)
# Report the acceptor score range.
45 print 'Acceptors max/min: %f / %f' % (max_score
,min_score
)
53 min_score
= min(min_score
,score
)
54 max_score
= max(max_score
,score
)
# Report the donor score range.
56 print 'Donor max/min: %f / %f' % (max_score
,min_score
)
# Intron length: `elem` is indexed like a 2x2 exon-boundary matrix --
# length = start of the next exon minus end of the previous one.
63 intron_len
= int(elem
[1,0] - elem
[0,1])
64 mean_intron_len
+= intron_len
65 min_score
= min(min_score
,intron_len
)
66 max_score
= max(max_score
,intron_len
)
# Normalise the accumulated intron length by the number of exon entries
# (the 1.0* factor forces float division under Python 2 semantics).
68 mean_intron_len
/= 1.0*len(exons
)
69 print 'Intron length max/min: %d / %d mean length %f' % (max_score
,min_score
,mean_intron_len
)
# Quality values: scan every quality vector for the global extremes.
75 for qVec
in qualities
:
77 min_score
= min(min_score
,elem
)
78 max_score
= max(max_score
,elem
)
80 print 'Quality values max/min: %d / %d mean' % (max_score
,min_score
)
83 def calc_stat(Acceptor
,Donor
,Exons
):
# Collect acceptor/donor scores split into positive examples (scores at the
# annotated splice positions) and negatives (all other positions), then
# track global score extremes per sequence.
# NOTE(review): parameters are named Acceptor/Donor but the body reads
# Acceptors/Donors (and Sequences), none of which are bound in this view;
# the initialisations of acc_vec_pos/acc_vec_neg/don_vec_pos/don_vec_neg
# and maxAcc/minAcc/maxDon/minDon are also missing from this extraction.
94 for jdx
in range(len(Acceptors
)):
95 currentExons
= Exons
[jdx
]
96 currentAcceptor
= Acceptors
[jdx
]
# Drop the first acceptor entry and pad with -inf so indices line up with
# the donor vector -- TODO confirm against pu conventions.
97 currentAcceptor
= currentAcceptor
[1:]
98 currentAcceptor
.append(-inf
)
99 currentDonor
= Donors
[jdx
]
# Acceptor scores: the position just before the second exon start is the
# annotated acceptor site (positive); everything else is negative.
101 for idx
,elem
in enumerate(currentAcceptor
):
102 if idx
== (int(currentExons
[1,0])-1): # acceptor site
103 acc_vec_pos
.append(elem
)
105 acc_vec_neg
.append(elem
)
# Donor scores: the first exon's end position is the annotated donor site.
107 for idx
,elem
in enumerate(currentDonor
):
108 if idx
== (int(currentExons
[0,1])): # donor site
109 don_vec_pos
.append(elem
)
111 don_vec_neg
.append(elem
)
# Filter out the -inf sentinels (positions with no defined score).
113 acc_pos
= [elem
for elem
in acc_vec_pos
if elem
!= -inf
]
114 acc_neg
= [elem
for elem
in acc_vec_neg
if elem
!= -inf
]
115 don_pos
= [elem
for elem
in don_vec_pos
if elem
!= -inf
]
116 don_neg
= [elem
for elem
in don_vec_neg
if elem
!= -inf
]
# Global per-sequence extremes of the finite acceptor/donor scores.
118 for idx
in range(len(Sequences
)):
119 acc
= [elem
for elem
in Acceptors
[idx
] if elem
!= -inf
]
120 maxAcc
= max(max(acc
),maxAcc
)
121 minAcc
= min(min(acc
),minAcc
)
123 don
= [elem
for elem
in Donors
[idx
] if elem
!= -inf
]
124 maxDon
= max(max(don
),maxDon
)
125 minDon
= min(min(don
),minDon
)
128 def pprint_alignment(_newSpliceAlign
,_newEstAlign
, dna_array
, est_array
):
# Build a human-readable three-line rendering of a spliced alignment:
# translated DNA (line1), a comparison row (line2, built on lines missing
# from this view), and the translated EST/read (line3).
# NOTE(review): `t_strand`, `last_identity`'s initialisation, the
# assignment of `first_identity` and the construction of `line2` all live
# on original lines absent from this extraction.
129 (qStart
, qEnd
, tStart
, tEnd
, num_exons
, qExonSizes
, qStarts
, qEnds
, tExonSizes
, tStarts
, tEnds
) =\
130 pu
.get_splice_info(_newSpliceAlign
,_newEstAlign
)
133 translation
= '-ACGTN_' #how aligned est and dna sequence is displayed
134 #(gap_char, 5 nucleotides, intron_char)
135 comparison_char
= '| ' #first: match_char, second: intron_char
137 (exon_length
, identity
, ssQuality
, exonQuality
, comparison
, qIDX
, tIDX
) =\
138 pu
.comp_identity(dna_array
, est_array
, t_strand
, qStart
, tStart
, translation
, comparison_char
)
140 first_identity
= None
# Locate the first and last match ('|') positions in the comparison row.
142 for i
in range(len(comparison
)):
143 if comparison
[i
] == '|' and first_identity
is None:
145 if comparison
[-i
] == '|' and last_identity
is None:
146 last_identity
= len(comparison
) - i
- 1
# Translate the integer-coded arrays in place into display characters.
149 for idx
in range(len(dna_array
)):
150 dna_array
[idx
] = translation
[int(dna_array
[idx
])]
151 est_array
[idx
] = translation
[int(est_array
[idx
])]
156 line1
= "".join(dna_array
)
158 line3
= "".join(est_array
)
160 return line1
,line2
,line3
def get_alignment(_newSpliceAlign, _newEstAlign, dna_array, est_array):
    """Thin wrapper around pu.get_splice_info for two alignment vectors.

    NOTE(review): dna_array and est_array are accepted but never used --
    kept for interface compatibility with existing callers.
    """
    splice_info = pu.get_splice_info(_newSpliceAlign, _newEstAlign)
    return splice_info
169 def combine_files(partial_files
,new_file
):
# Concatenate the given partial files into a single new file.
# NOTE(review): the original try/except around the open (the print below
# is its handler) and the per-line write statements are missing from this
# extraction.
171 fh
= open(new_file
,'w')
173 print 'Problem while trying to open file: %s' % new_file
# Stream each partial file line by line into the combined output.
176 for pfile
in partial_files
:
177 for line
in open(pfile
):
183 def split_file(filename
,result_dir
,num_parts
):
185 Splits a file of n lines into num_parts parts
# First pass: count the lines of the input file.
# NOTE(review): initialisations (line_ctr, begin, parts_fn, all_intervals,
# lineCtr) and several loop/branch bodies are missing from this
# extraction; fragments below are kept byte-identical.
192 print 'counting lines'
194 for line
in open(filename
,'r'):
197 print 'found %d lines' % line_ctr
# Integer division under Python 2: each part gets line_ctr/num_parts lines
# (plus the +1 overlap below -- TODO confirm intent).
199 part_size
= line_ctr
/ num_parts
# Build the (begin, end) line interval for every part.
203 for idx
in range(1,num_parts
+1):
209 end
= begin
+part_size
+1
211 all_intervals
.append((begin
,end
))
# Create one output file per interval under result_dir;
# `jp` is presumably os.path.join -- verify against the import block.
214 for pos
,params
in enumerate(all_intervals
):
216 out_fn
= jp(result_dir
,'map.part_%d'%pos
)
218 parts_fn
.append(out_fn
)
219 out_fh
= open(out_fn
,'w+')
# Reverse so intervals/filenames can be consumed with pop() in order.
222 all_intervals
.reverse()
# Second pass: walk the input and route each line into its part file.
227 in_fh
= open(filename
,'r')
229 line
= in_fh
.readline()
233 if beg
<= lineCtr
< end
:
# Current interval exhausted: advance to the next interval and part file.
237 (beg
,end
) = all_intervals
.pop()
238 out_fn
= parts_fn
.pop()
240 out_fh
= open(out_fn
,'w+')
246 def get_slices(dataset_size
,num_nodes
):
248 Given the size of the dataset and the number of nodes which shall be used
249 this function returns the slice of the dataset for each node.
# Base slice size; Python 2 integer division (remainder handling lives on
# lines missing from this extraction, as do the definitions of `params`
# and `all_instances` and the return statement).
254 part
= dataset_size
/ num_nodes
# Build one slice descriptor per node.
257 for idx
in range(1,num_nodes
+1):
268 all_instances
.append(params
)
273 def logwrite(mes
,settings
):
275 A wrapper to write message to the global logfile.
# Append to the log file named by settings['global_log_fn']; the write and
# close statements are on lines missing from this extraction.
277 fh
= open(settings
['global_log_fn'],'a')
281 if __name__
== '__main__':
# Ad-hoc driver: split a hard-coded mapping file into 25 parts under /tmp.
282 split_file('/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/map.vm','/tmp',25)