#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pdb
import os.path
import sys

from numpy.matlib import mat, zeros, ones, inf

import palma.palma_utils as pu
from palma.output_formating import print_results

# some useful constants

extended_alphabet = ['-','a','c','g','t','n','[',']']
alphabet = ['-','a','c','g','t','n']

alignment_alphabet = ['-','a','c','g','t','n','z']


def print_prediction(example):
    """
    Return the predicted alignment as one printable string of three lines
    (dna, comparison string, read).
    """
    dna_array = example['dna_array']
    read_array = example['read_array']

    dna = map(lambda x: alignment_alphabet[x], dna_array)
    read = map(lambda x: alignment_alphabet[x], read_array)

    spliceAlign = example['spliceAlign']
    estAlign = example['estAlign']

    line1, line2, line3 = pprint_alignment(spliceAlign, estAlign, dna, read)

    result = '%s\n%s\n%s' % (line1, line2, line3)
    return result

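# Minimal sketch (hypothetical values) of the dict expected by
# print_prediction(): 'dna_array' and 'read_array' hold integer codes that
# index into alignment_alphabet above, while 'spliceAlign' and 'estAlign'
# are passed through unchanged to pprint_alignment().
#
#   example = {
#       'dna_array'  : [1, 2, 0, 3, 4],   # -> 'a', 'c', '-', 'g', 't'
#       'read_array' : [1, 2, 2, 3, 4],   # -> 'a', 'c', 'c', 'g', 't'
#       'spliceAlign': spliceAlign,       # whatever alignment structures
#       'estAlign'   : estAlign,          # palma_utils.get_splice_info() expects
#   }
#   print print_prediction(example)
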
def calc_info(acc, don, exons, qualities):
    """
    Print summary statistics of the acceptor/donor scores, intron lengths
    and quality values.
    """
    min_score = 100
    max_score = -100

    for elem in acc:
        for score in elem:
            if score != -inf:
                min_score = min(min_score, score)
                max_score = max(max_score, score)

    print 'Acceptors max/min: %f / %f' % (max_score, min_score)

    min_score = 100
    max_score = -100

    for elem in don:
        for score in elem:
            if score != -inf:
                min_score = min(min_score, score)
                max_score = max(max_score, score)

    print 'Donor max/min: %f / %f' % (max_score, min_score)

    min_score = 10000
    max_score = 0
    mean_intron_len = 0

    for elem in exons:
        intron_len = int(elem[1,0] - elem[0,1])
        mean_intron_len += intron_len
        min_score = min(min_score, intron_len)
        max_score = max(max_score, intron_len)

    mean_intron_len /= 1.0 * len(exons)
    print 'Intron length max/min: %d / %d mean length %f' % (max_score, min_score, mean_intron_len)

    min_score = 10000
    max_score = 0
    mean_quality = 0
    num_quality_values = 0

    for qVec in qualities:
        for elem in qVec:
            min_score = min(min_score, elem)
            max_score = max(max_score, elem)
            mean_quality += elem
            num_quality_values += 1

    mean_quality /= 1.0 * num_quality_values
    print 'Quality values max/min: %d / %d mean %f' % (max_score, min_score, mean_quality)

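# Illustrative sketch (assumed data layout): calc_info() and calc_stat()
# treat each entry of 'exons'/'Exons' as a small matrix of exon boundaries,
# where elem[0,1] is the end of one exon and elem[1,0] the start of the
# next, so the intron length is their difference, e.g.:
#
#   from numpy.matlib import mat
#   exon_pair = mat([[100, 150],
#                    [300, 380]])                        # hypothetical coordinates
#   intron_len = int(exon_pair[1,0] - exon_pair[0,1])    # 300 - 150 = 150
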

def calc_stat(Acceptors, Donors, Exons):
    """
    Collect statistics of the acceptor/donor scores around the annotated
    splice sites.
    """
    maxAcc = -100
    minAcc = 100
    maxDon = -100
    minDon = 100

    acc_vec_pos = []
    acc_vec_neg = []
    don_vec_pos = []
    don_vec_neg = []

    for jdx in range(len(Acceptors)):
        currentExons = Exons[jdx]
        currentAcceptor = Acceptors[jdx]
        currentAcceptor = currentAcceptor[1:]
        currentAcceptor.append(-inf)
        currentDonor = Donors[jdx]

        for idx, elem in enumerate(currentAcceptor):
            if idx == (int(currentExons[1,0]) - 1):  # acceptor site
                acc_vec_pos.append(elem)
            else:
                acc_vec_neg.append(elem)

        for idx, elem in enumerate(currentDonor):
            if idx == (int(currentExons[0,1])):  # donor site
                don_vec_pos.append(elem)
            else:
                don_vec_neg.append(elem)

    acc_pos = [elem for elem in acc_vec_pos if elem != -inf]
    acc_neg = [elem for elem in acc_vec_neg if elem != -inf]
    don_pos = [elem for elem in don_vec_pos if elem != -inf]
    don_neg = [elem for elem in don_vec_neg if elem != -inf]

    for idx in range(len(Acceptors)):
        acc = [elem for elem in Acceptors[idx] if elem != -inf]
        maxAcc = max(max(acc), maxAcc)
        minAcc = min(min(acc), minAcc)

        don = [elem for elem in Donors[idx] if elem != -inf]
        maxDon = max(max(don), maxDon)
        minDon = min(min(don), minDon)


def pprint_alignment(_newSpliceAlign, _newEstAlign, dna_array, est_array):
    """
    Return the alignment as three printable lines: the translated dna, the
    comparison string ('|' marks a match) and the translated est/read.
    """
    (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds) =\
    pu.get_splice_info(_newSpliceAlign, _newEstAlign)

    t_strand = '+'
    translation = '-ACGTN_' # how aligned est and dna sequence is displayed
                            # (gap_char, 5 nucleotides, intron_char)
    comparison_char = '| '  # first: match_char, second: intron_char

    (exon_length, identity, ssQuality, exonQuality, comparison, qIDX, tIDX) =\
    pu.comp_identity(dna_array, est_array, t_strand, qStart, tStart, translation, comparison_char)

    first_identity = None
    last_identity = None
    for i in range(len(comparison)):
        if comparison[i] == '|' and first_identity is None:
            first_identity = i
        # scan from the end for the last match; -(i+1) instead of -i, which
        # would point at the first character for i == 0
        if comparison[-(i+1)] == '|' and last_identity is None:
            last_identity = len(comparison) - i - 1

    try:
        for idx in range(len(dna_array)):
            dna_array[idx] = translation[int(dna_array[idx])]
            est_array[idx] = translation[int(est_array[idx])]
    except:
        # the arrays may already contain characters instead of integer codes
        #pdb.set_trace()
        pass

    line1 = "".join(dna_array)
    line2 = comparison
    line3 = "".join(est_array)

    return line1, line2, line3


def get_alignment(_newSpliceAlign, _newEstAlign, dna_array, est_array):
    return pu.get_splice_info(_newSpliceAlign, _newEstAlign)


##########

def combine_files(partial_files, new_file):
    try:
        fh = open(new_file, 'w')
    except IOError:
        print 'Problem while trying to open file: %s' % new_file
        sys.exit(1)

    for pfile in partial_files:
        for line in open(pfile):
            fh.write(line)

    fh.close()


def split_file(filename, result_dir, num_parts):
    """
    Splits a file of n lines into num_parts parts
    """

    jp = os.path.join

    all_intervals = []

    print 'counting lines'
    line_ctr = 0
    for line in open(filename, 'r'):
        line_ctr += 1

    print 'found %d lines' % line_ctr

    part_size = line_ctr / num_parts
    begin = 0
    end = 0

    for idx in range(1, num_parts+1):
        begin = end

        if idx == num_parts:
            end = line_ctr
        else:
            end = begin + part_size + 1

        all_intervals.append((begin, end))

    parts_fn = []
    for pos, params in enumerate(all_intervals):
        beg, end = params
        out_fn = jp(result_dir, 'map.part_%d' % pos)
        print out_fn
        parts_fn.append(out_fn)
        # create/truncate the part file right away, but do not keep the handle
        open(out_fn, 'w+').close()

    parts_fn.reverse()
    all_intervals.reverse()

    lineCtr = 0
    beg = -1
    end = -1
    out_fh = None
    in_fh = open(filename, 'r')
    while True:
        line = in_fh.readline()
        if line == '':
            break

        if beg <= lineCtr < end:
            out_fh.write(line)
            lineCtr += 1
        else:
            # current interval is exhausted: switch to the next output file
            (beg, end) = all_intervals.pop()
            out_fn = parts_fn.pop()
            if out_fh is not None:
                out_fh.close()
            out_fh = open(out_fn, 'w+')
            out_fh.write(line)
            lineCtr += 1

    if out_fh is not None:
        out_fh.close()

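# Usage sketch (hypothetical paths): split an input file into four pieces
# named map.part_0 .. map.part_3 inside the given result directory.
#
#   split_file('/tmp/reads.map', '/tmp/parts', 4)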

def get_slices(dataset_size, num_nodes):
    """
    Given the size of the dataset and the number of nodes which shall be used
    this function returns the slice of the dataset for each node.
    """

    all_instances = []

    part = dataset_size / num_nodes
    begin = 0
    end = 0
    for idx in range(1, num_nodes+1):
        begin = end

        if idx == num_nodes:
            end = dataset_size
        else:
            end = begin + part

        params = (begin, end)

        all_instances.append(params)

    return all_instances

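# Example: distributing a dataset of 10 examples over 3 nodes yields
#
#   >>> get_slices(10, 3)
#   [(0, 3), (3, 6), (6, 10)]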

def logwrite(mes, settings):
    """
    A wrapper to write a message to the global logfile.
    """
    fh = open(settings['global_log_fn'], 'a')
    fh.write(mes)
    fh.close()

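# Usage sketch (hypothetical settings dict): any dict that provides the
# 'global_log_fn' key works here.
#
#   settings = {'global_log_fn': '/tmp/qpalma.log'}
#   logwrite('training iteration finished\n', settings)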

if __name__ == '__main__':
    split_file('/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/map.vm', '/tmp', 25)