qpalma/utils.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pdb
import os.path

from numpy.matlib import mat,zeros,ones,inf

import palma.palma_utils as pu
from palma.output_formating import print_results

# some useful constants

extended_alphabet = ['-','a','c','g','t','n','[',']']
alphabet = ['-','a','c','g','t','n']

alignment_alphabet = ['-','a','c','g','t','n','z']

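# A small illustration of the integer encoding used below: print_prediction
# receives dna/read arrays of indices into alignment_alphabet, e.g.
#
#   map(lambda x: alignment_alphabet[x], [1,2,3,4,0,6])  ->  ['a','c','g','t','-','z']
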
def print_prediction(example):
    dna_array = example['dna_array']
    read_array = example['read_array']

    dna = map(lambda x: alignment_alphabet[x],dna_array)
    read = map(lambda x: alignment_alphabet[x],read_array)

    spliceAlign = example['spliceAlign']
    estAlign = example['estAlign']

    line1,line2,line3 = pprint_alignment(spliceAlign, estAlign, dna, read)

    result = '%s\n%s\n%s' %(line1,line2,line3)
    return result

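# Hypothetical usage sketch (the dict keys are the ones accessed above; the
# values would normally come from a QPalma prediction, not be built by hand):
#
#   example = {'dna_array'  : [1,2,3,4],
#              'read_array' : [1,2,3,4],
#              'spliceAlign': spliceAlign,   # alignment arrays produced by the
#              'estAlign'   : estAlign}      # QPalma/palma alignment code
#   print print_prediction(example)
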
def calc_info(acc,don,exons,qualities):
    min_score = 100
    max_score = -100

    for elem in acc:
        for score in elem:
            if score != -inf:
                min_score = min(min_score,score)
                max_score = max(max_score,score)

    print 'Acceptors max/min: %f / %f' % (max_score,min_score)

    min_score = 100
    max_score = -100

    for elem in don:
        for score in elem:
            if score != -inf:
                min_score = min(min_score,score)
                max_score = max(max_score,score)

    print 'Donor max/min: %f / %f' % (max_score,min_score)

    min_score = 10000
    max_score = 0
    mean_intron_len = 0

    for elem in exons:
        # intron length = start of the second exon minus end of the first exon
        intron_len = int(elem[1,0] - elem[0,1])
        mean_intron_len += intron_len
        min_score = min(min_score,intron_len)
        max_score = max(max_score,intron_len)

    mean_intron_len /= 1.0*len(exons)
    print 'Intron length max/min: %d / %d mean length %f' % (max_score,min_score,mean_intron_len)

    min_score = 10000
    max_score = 0
    mean_quality = 0
    quality_ctr = 0

    for qVec in qualities:
        for elem in qVec:
            min_score = min(min_score,elem)
            max_score = max(max_score,elem)
            mean_quality += elem
            quality_ctr += 1

    mean_quality /= 1.0*quality_ctr
    print 'Quality values max/min: %d / %d mean %f' % (max_score,min_score,mean_quality)

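# A rough sketch of the expected argument layout, inferred from the indexing
# above (an assumption, not taken from the original documentation):
#
#   acc       - per-read lists of acceptor splice site scores (may contain -inf)
#   don       - per-read lists of donor splice site scores (may contain -inf)
#   exons     - per-read 2x2 matrices of exon boundaries, e.g. mat([[100,150],[900,950]]),
#               so that elem[1,0]-elem[0,1] is the intron length (here 750)
#   qualities - per-read lists of read quality values
#
#   calc_info(acc,don,exons,qualities)
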
def calc_stat(Acceptors,Donors,Exons):
    maxAcc = -100
    minAcc = 100
    maxDon = -100
    minDon = 100

    acc_vec_pos = []
    acc_vec_neg = []
    don_vec_pos = []
    don_vec_neg = []

    for jdx in range(len(Acceptors)):
        currentExons = Exons[jdx]
        currentAcceptor = Acceptors[jdx]
        currentAcceptor = currentAcceptor[1:]
        currentAcceptor.append(-inf)
        currentDonor = Donors[jdx]

        for idx,elem in enumerate(currentAcceptor):
            if idx == (int(currentExons[1,0])-1): # acceptor site
                acc_vec_pos.append(elem)
            else:
                acc_vec_neg.append(elem)

        for idx,elem in enumerate(currentDonor):
            if idx == (int(currentExons[0,1])): # donor site
                don_vec_pos.append(elem)
            else:
                don_vec_neg.append(elem)

    # discard positions without a defined score
    acc_pos = [elem for elem in acc_vec_pos if elem != -inf]
    acc_neg = [elem for elem in acc_vec_neg if elem != -inf]
    don_pos = [elem for elem in don_vec_pos if elem != -inf]
    don_neg = [elem for elem in don_vec_neg if elem != -inf]

    for idx in range(len(Acceptors)):
        acc = [elem for elem in Acceptors[idx] if elem != -inf]
        maxAcc = max(max(acc),maxAcc)
        minAcc = min(min(acc),minAcc)

        don = [elem for elem in Donors[idx] if elem != -inf]
        maxDon = max(max(don),maxDon)
        minDon = min(min(don),minDon)


def pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
    (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds) =\
        pu.get_splice_info(_newSpliceAlign,_newEstAlign)

    t_strand = '+'
    translation = '-ACGTN_' # how the aligned est and dna sequences are displayed
                            # (gap_char, 5 nucleotides, intron_char)
    comparison_char = '| '  # first: match_char, second: intron_char

    (exon_length, identity, ssQuality, exonQuality, comparison, qIDX, tIDX) =\
        pu.comp_identity(dna_array, est_array, t_strand, qStart, tStart, translation, comparison_char)

    # positions of the first and the last match in the comparison string
    first_identity = None
    last_identity = None
    for i in range(len(comparison)):
        if comparison[i] == '|' and first_identity is None:
            first_identity = i
        if comparison[-(i+1)] == '|' and last_identity is None:
            last_identity = len(comparison) - i - 1

    try:
        for idx in range(len(dna_array)):
            dna_array[idx] = translation[int(dna_array[idx])]
            est_array[idx] = translation[int(est_array[idx])]
    except:
        # if the conversion fails the arrays are left as they are
        #pdb.set_trace()
        pass

    line1 = "".join(dna_array)
    line2 = comparison
    line3 = "".join(est_array)

    return line1,line2,line3

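# The three returned strings are intended to be printed on top of each other,
# roughly like this (toy sketch, not the output of a real run):
#
#   acgt______acgt     line1: dna, introns shown by the intron char
#   ||||      ||||     line2: '|' marks matches, ' ' is used inside introns
#   acgt      acgt     line3: read/est
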
def get_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
    return pu.get_splice_info(_newSpliceAlign,_newEstAlign)


##########

def split_file(filename,result_dir,num_parts):
    """
    Splits a file of n lines into num_parts parts.
    """

    jp = os.path.join

    all_intervals = []

    print 'counting lines'
    line_ctr = 0
    for line in open(filename,'r'):
        line_ctr += 1

    print 'found %d lines' % line_ctr

    # compute the (begin,end) line interval for each part
    part_size = line_ctr / num_parts
    begin = 0
    end = 0

    for idx in range(1,num_parts+1):
        begin = end

        if idx == num_parts:
            end = line_ctr
        else:
            end = begin+part_size+1

        all_intervals.append((begin,end))

    # determine the output filenames and create the (still empty) part files
    parts_fn = []
    for pos,params in enumerate(all_intervals):
        beg,end = params
        out_fn = jp(result_dir,'map.part_%d'%pos)
        print out_fn
        parts_fn.append(out_fn)
        open(out_fn,'w+').close()

    parts_fn.reverse()
    all_intervals.reverse()

    # walk over the input once and write every line to the part whose
    # interval contains the current line number
    lineCtr = 0
    beg = -1
    end = -1
    out_fh = None
    in_fh = open(filename,'r')
    while True:
        line = in_fh.readline()
        if line == '':
            break

        if beg <= lineCtr < end:
            out_fh.write(line)
        else:
            # this line starts the next part: switch interval and output file
            (beg,end) = all_intervals.pop()
            out_fn = parts_fn.pop()
            if out_fh is not None:
                out_fh.close()
            out_fh = open(out_fn,'w+')
            out_fh.write(line)

        lineCtr += 1

    out_fh.close()
    in_fh.close()

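# Hypothetical example (paths made up, mirroring the __main__ call below):
#
#   split_file('/tmp/reads.map','/tmp/parts',4)
#
# writes consecutive slices of /tmp/reads.map to /tmp/parts/map.part_0
# through /tmp/parts/map.part_3.
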
def get_slices(dataset_size,num_nodes):
    """
    Given the size of the dataset and the number of nodes which shall be used
    this function returns the slice of the dataset for each node.
    """

    all_instances = []

    part = dataset_size / num_nodes
    begin = 0
    end = 0

    for idx in range(1,num_nodes+1):
        begin = end

        if idx == num_nodes:
            end = dataset_size
        else:
            end = begin+part

        params = (begin,end)
        all_instances.append(params)

    return all_instances

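# For example (assuming Python 2 integer division, as used above):
#
#   get_slices(10,3)  ->  [(0, 3), (3, 6), (6, 10)]
#
# i.e. the first num_nodes-1 slices have size dataset_size/num_nodes and the
# last slice absorbs the remainder.
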
if __name__ == '__main__':
    split_file('/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/map.vm','/tmp',25)