+ fixed minor bugs
[qpalma.git] / qpalma / utils.py
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
8
9 import pdb
10 import sys
11 import os.path
12
13 from numpy.matlib import mat,zeros,ones,inf
14
15 import palma_utils as pu
16
17 # some useful constants
18
19 extended_alphabet = ['-','a','c','g','t','n','[',']']
20 alphabet = ['-','a','c','g','t','n']
21
22 alignment_alphabet = ['-','a','c','g','t','n','z']
23
24
25 def print_prediction(example):
26 dna_array = example['dna_array']
27 read_array = example['read_array']
28
29 dna = map(lambda x: alignment_alphabet[x],dna_array)
30 read = map(lambda x: alignment_alphabet[x],read_array)
31
32 spliceAlign = example['spliceAlign']
33 estAlign = example['estAlign']
34
35 line1,line2,line3 = pprint_alignment(spliceAlign, estAlign, dna, read)
36
37 result = '%s\n%s\n%s' %(line1,line2,line3)
38 return result
39
40 def calc_info(acc,don,exons,qualities):
41 min_score = 100
42 max_score = -100
43
44 for elem in acc:
45 for score in elem:
46 if score != -inf:
47 min_score = min(min_score,score)
48 max_score = max(max_score,score)
49
50 print 'Acceptors max/min: %f / %f' % (max_score,min_score)
51
52 min_score = 100
53 max_score = -100
54
55 for elem in don:
56 for score in elem:
57 if score != -inf:
58 min_score = min(min_score,score)
59 max_score = max(max_score,score)
60
61 print 'Donor max/min: %f / %f' % (max_score,min_score)
62
63 min_score = 10000
64 max_score = 0
65 mean_intron_len = 0
66
67 for elem in exons:
68 intron_len = int(elem[1,0] - elem[0,1])
69 mean_intron_len += intron_len
70 min_score = min(min_score,intron_len)
71 max_score = max(max_score,intron_len)
72
73 mean_intron_len /= 1.0*len(exons)
74 print 'Intron length max/min: %d / %d mean length %f' % (max_score,min_score,mean_intron_len)
75
76 min_score = 10000
77 max_score = 0
78 mean_quality = 0
79
80 for qVec in qualities:
81 for elem in qVec:
82 min_score = min(min_score,elem)
83 max_score = max(max_score,elem)
84
85 print 'Quality values max/min: %d / %d mean' % (max_score,min_score)
86
87
88 def calc_stat(Acceptor,Donor,Exons):
89 maxAcc = -100
90 minAcc = 100
91 maxDon = -100
92 minDon = 100
93
94 acc_vec_pos = []
95 acc_vec_neg = []
96 don_vec_pos = []
97 don_vec_neg = []
98
99 for jdx in range(len(Acceptors)):
100 currentExons = Exons[jdx]
101 currentAcceptor = Acceptors[jdx]
102 currentAcceptor = currentAcceptor[1:]
103 currentAcceptor.append(-inf)
104 currentDonor = Donors[jdx]
105
106 for idx,elem in enumerate(currentAcceptor):
107 if idx == (int(currentExons[1,0])-1): # acceptor site
108 acc_vec_pos.append(elem)
109 else:
110 acc_vec_neg.append(elem)
111
112 for idx,elem in enumerate(currentDonor):
113 if idx == (int(currentExons[0,1])): # donor site
114 don_vec_pos.append(elem)
115 else:
116 don_vec_neg.append(elem)
117
118 acc_pos = [elem for elem in acc_vec_pos if elem != -inf]
119 acc_neg = [elem for elem in acc_vec_neg if elem != -inf]
120 don_pos = [elem for elem in don_vec_pos if elem != -inf]
121 don_neg = [elem for elem in don_vec_neg if elem != -inf]
122
123 for idx in range(len(Sequences)):
124 acc = [elem for elem in Acceptors[idx] if elem != -inf]
125 maxAcc = max(max(acc),maxAcc)
126 minAcc = min(min(acc),minAcc)
127
128 don = [elem for elem in Donors[idx] if elem != -inf]
129 maxDon = max(max(don),maxDon)
130 minDon = min(min(don),minDon)
131
132
133 def pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
134 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds) =\
135 pu.get_splice_info(_newSpliceAlign,_newEstAlign)
136
137 t_strand = '+'
138 translation = '-ACGTN_' #how aligned est and dna sequence is displayed
139 #(gap_char, 5 nucleotides, intron_char)
140 comparison_char = '| ' #first: match_char, second: intron_char
141
142 (exon_length, identity, ssQuality, exonQuality, comparison, qIDX, tIDX) =\
143 pu.comp_identity(dna_array, est_array, t_strand, qStart, tStart, translation, comparison_char)
144
145 first_identity = None
146 last_identity = None
147 for i in range(len(comparison)):
148 if comparison[i] == '|' and first_identity is None:
149 first_identity = i
150 if comparison[-i] == '|' and last_identity is None:
151 last_identity = len(comparison) - i - 1
152
153 try:
154 for idx in range(len(dna_array)):
155 dna_array[idx] = translation[int(dna_array[idx])]
156 est_array[idx] = translation[int(est_array[idx])]
157 except:
158 #pdb.set_trace()
159 pass
160
161 line1 = "".join(dna_array)
162 line2 = comparison
163 line3 = "".join(est_array)
164
165 return line1,line2,line3
166
167
168 def get_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
169 return pu.get_splice_info(_newSpliceAlign,_newEstAlign)
170
171
172 ##########
173
174 def combine_files(partial_files,new_file):
175 try:
176 fh = open(new_file,'w')
177 except IOError:
178 print 'Problem while trying to open file: %s' % new_file
179 sys.exit(1)
180
181 for pfile in partial_files:
182 for line in open(pfile):
183 fh.write(line)
184
185 fh.close()
186
187
188 def split_file(filename,result_dir,num_parts):
189 """
190 Splits a file of n lines into num_parts parts
191 """
192
193 jp = os.path.join
194
195 all_intervals = []
196
197 print 'counting lines'
198 line_ctr = 0
199 for line in open(filename,'r'):
200 line_ctr += 1
201
202 print 'found %d lines' % line_ctr
203
204 part_size = line_ctr / num_parts
205 begin = 0
206 end = 0
207
208 for idx in range(1,num_parts+1):
209 begin = end
210
211 if idx == num_parts:
212 end = line_ctr
213 else:
214 end = begin+part_size+1
215
216 all_intervals.append((begin,end))
217
218 parts_fn = []
219 for pos,params in enumerate(all_intervals):
220 beg,end = params
221 out_fn = jp(result_dir,'map.part_%d'%pos)
222 print out_fn
223 parts_fn.append(out_fn)
224 out_fh = open(out_fn,'w+')
225
226 parts_fn.reverse()
227 all_intervals.reverse()
228
229 lineCtr = 0
230 beg = -1
231 end = -1
232 in_fh = open(filename,'r')
233 while True:
234 line = in_fh.readline()
235 if line == '':
236 break
237
238 if beg <= lineCtr < end:
239 out_fh.write(line)
240 lineCtr += 1
241 else:
242 (beg,end) = all_intervals.pop()
243 out_fn = parts_fn.pop()
244 out_fh.close()
245 out_fh = open(out_fn,'w+')
246 out_fh.write(line)
247
248 out_fh.close()
249
250
251 def get_slices(dataset_size,num_nodes):
252 """
253 Given the size of the dataset and the number of nodes which shall be used
254 this function returns the slice of the dataset for each node.
255 """
256
257 all_instances = []
258
259 part = dataset_size / num_nodes
260 begin = 0
261 end = 0
262 for idx in range(1,num_nodes+1):
263
264 if idx == num_nodes:
265 begin = end
266 end = dataset_size
267 else:
268 begin = end
269 end = begin+part
270
271 params = (begin,end)
272
273 all_instances.append(params)
274
275 return all_instances
276
277
278 def logwrite(mes,settings):
279 """
280 A wrapper to write message to the global logfile.
281 """
282 fh = open(settings['global_log_fn'],'a')
283 fh.write(mes)
284
285
286 if __name__ == '__main__':
287 split_file('/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/map.vm','/tmp',25)