f80f7ad7a2b8e6b182713d5d6c5c62420d399cb1
[qpalma.git] / qpalma / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import pdb
5 import sys
6 import os.path
7
8 from numpy.matlib import mat,zeros,ones,inf
9
10 import palma.palma_utils as pu
11 from palma.output_formating import print_results
12
13 # some useful constants
14
15 extended_alphabet = ['-','a','c','g','t','n','[',']']
16 alphabet = ['-','a','c','g','t','n']
17
18 alignment_alphabet = ['-','a','c','g','t','n','z']
19
20
21 def print_prediction(example):
22 dna_array = example['dna_array']
23 read_array = example['read_array']
24
25 dna = map(lambda x: alignment_alphabet[x],dna_array)
26 read = map(lambda x: alignment_alphabet[x],read_array)
27
28 spliceAlign = example['spliceAlign']
29 estAlign = example['estAlign']
30
31 line1,line2,line3 = pprint_alignment(spliceAlign, estAlign, dna, read)
32
33 result = '%s\n%s\n%s' %(line1,line2,line3)
34 return result
35
36 def calc_info(acc,don,exons,qualities):
37 min_score = 100
38 max_score = -100
39
40 for elem in acc:
41 for score in elem:
42 if score != -inf:
43 min_score = min(min_score,score)
44 max_score = max(max_score,score)
45
46 print 'Acceptors max/min: %f / %f' % (max_score,min_score)
47
48 min_score = 100
49 max_score = -100
50
51 for elem in don:
52 for score in elem:
53 if score != -inf:
54 min_score = min(min_score,score)
55 max_score = max(max_score,score)
56
57 print 'Donor max/min: %f / %f' % (max_score,min_score)
58
59 min_score = 10000
60 max_score = 0
61 mean_intron_len = 0
62
63 for elem in exons:
64 intron_len = int(elem[1,0] - elem[0,1])
65 mean_intron_len += intron_len
66 min_score = min(min_score,intron_len)
67 max_score = max(max_score,intron_len)
68
69 mean_intron_len /= 1.0*len(exons)
70 print 'Intron length max/min: %d / %d mean length %f' % (max_score,min_score,mean_intron_len)
71
72 min_score = 10000
73 max_score = 0
74 mean_quality = 0
75
76 for qVec in qualities:
77 for elem in qVec:
78 min_score = min(min_score,elem)
79 max_score = max(max_score,elem)
80
81 print 'Quality values max/min: %d / %d mean' % (max_score,min_score)
82
83
84 def calc_stat(Acceptor,Donor,Exons):
85 maxAcc = -100
86 minAcc = 100
87 maxDon = -100
88 minDon = 100
89
90 acc_vec_pos = []
91 acc_vec_neg = []
92 don_vec_pos = []
93 don_vec_neg = []
94
95 for jdx in range(len(Acceptors)):
96 currentExons = Exons[jdx]
97 currentAcceptor = Acceptors[jdx]
98 currentAcceptor = currentAcceptor[1:]
99 currentAcceptor.append(-inf)
100 currentDonor = Donors[jdx]
101
102 for idx,elem in enumerate(currentAcceptor):
103 if idx == (int(currentExons[1,0])-1): # acceptor site
104 acc_vec_pos.append(elem)
105 else:
106 acc_vec_neg.append(elem)
107
108 for idx,elem in enumerate(currentDonor):
109 if idx == (int(currentExons[0,1])): # donor site
110 don_vec_pos.append(elem)
111 else:
112 don_vec_neg.append(elem)
113
114 acc_pos = [elem for elem in acc_vec_pos if elem != -inf]
115 acc_neg = [elem for elem in acc_vec_neg if elem != -inf]
116 don_pos = [elem for elem in don_vec_pos if elem != -inf]
117 don_neg = [elem for elem in don_vec_neg if elem != -inf]
118
119 for idx in range(len(Sequences)):
120 acc = [elem for elem in Acceptors[idx] if elem != -inf]
121 maxAcc = max(max(acc),maxAcc)
122 minAcc = min(min(acc),minAcc)
123
124 don = [elem for elem in Donors[idx] if elem != -inf]
125 maxDon = max(max(don),maxDon)
126 minDon = min(min(don),minDon)
127
128
129 def pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
130 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds) =\
131 pu.get_splice_info(_newSpliceAlign,_newEstAlign)
132
133 t_strand = '+'
134 translation = '-ACGTN_' #how aligned est and dna sequence is displayed
135 #(gap_char, 5 nucleotides, intron_char)
136 comparison_char = '| ' #first: match_char, second: intron_char
137
138 (exon_length, identity, ssQuality, exonQuality, comparison, qIDX, tIDX) =\
139 pu.comp_identity(dna_array, est_array, t_strand, qStart, tStart, translation, comparison_char)
140
141 first_identity = None
142 last_identity = None
143 for i in range(len(comparison)):
144 if comparison[i] == '|' and first_identity is None:
145 first_identity = i
146 if comparison[-i] == '|' and last_identity is None:
147 last_identity = len(comparison) - i - 1
148
149 try:
150 for idx in range(len(dna_array)):
151 dna_array[idx] = translation[int(dna_array[idx])]
152 est_array[idx] = translation[int(est_array[idx])]
153 except:
154 #pdb.set_trace()
155 pass
156
157 line1 = "".join(dna_array)
158 line2 = comparison
159 line3 = "".join(est_array)
160
161 return line1,line2,line3
162
163
164 def get_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
165 return pu.get_splice_info(_newSpliceAlign,_newEstAlign)
166
167
168 ##########
169
170 def combine_files(partial_files,new_file):
171 try:
172 fh = open(new_file,'w')
173 except IOError:
174 print 'Problem while trying to open file: %s' % new_file
175 sys.exit(1)
176
177 for pfile in partial_files:
178 for line in open(pfile):
179 fh.write(line)
180
181 fh.close()
182
183
184 def split_file(filename,result_dir,num_parts):
185 """
186 Splits a file of n lines into num_parts parts
187 """
188
189 jp = os.path.join
190
191 all_intervals = []
192
193 print 'counting lines'
194 line_ctr = 0
195 for line in open(filename,'r'):
196 line_ctr += 1
197
198 print 'found %d lines' % line_ctr
199
200 part_size = line_ctr / num_parts
201 begin = 0
202 end = 0
203
204 for idx in range(1,num_parts+1):
205 begin = end
206
207 if idx == num_parts:
208 end = line_ctr
209 else:
210 end = begin+part_size+1
211
212 all_intervals.append((begin,end))
213
214 parts_fn = []
215 for pos,params in enumerate(all_intervals):
216 beg,end = params
217 out_fn = jp(result_dir,'map.part_%d'%pos)
218 print out_fn
219 parts_fn.append(out_fn)
220 out_fh = open(out_fn,'w+')
221
222 parts_fn.reverse()
223 all_intervals.reverse()
224
225 lineCtr = 0
226 beg = -1
227 end = -1
228 in_fh = open(filename,'r')
229 while True:
230 line = in_fh.readline()
231 if line == '':
232 break
233
234 if beg <= lineCtr < end:
235 out_fh.write(line)
236 lineCtr += 1
237 else:
238 (beg,end) = all_intervals.pop()
239 out_fn = parts_fn.pop()
240 out_fh.close()
241 out_fh = open(out_fn,'w+')
242 out_fh.write(line)
243
244 out_fh.close()
245
246
247 def get_slices(dataset_size,num_nodes):
248 """
249 Given the size of the dataset and the number of nodes which shall be used
250 this function returns the slice of the dataset for each node.
251 """
252
253 all_instances = []
254
255 part = dataset_size / num_nodes
256 begin = 0
257 end = 0
258 for idx in range(1,num_nodes+1):
259
260 if idx == num_nodes:
261 begin = end
262 end = dataset_size
263 else:
264 begin = end
265 end = begin+part
266
267 params = (begin,end)
268
269 all_instances.append(params)
270
271 return all_instances
272
273
274 def logwrite(mes,settings):
275 """
276 A wrapper to write message to the global logfile.
277 """
278 fh = open(settings['global_log_fn'],'a')
279 fh.write(mes)
280
281
282 if __name__ == '__main__':
283 split_file('/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/map.vm','/tmp',25)