db685d1c03dcd7f7836e4f836ca0225f947c4676
[qpalma.git] / scripts / Utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import pdb
5 import os.path
6
7 from numpy.matlib import mat,zeros,ones,inf
8
9 import palma.palma_utils as pu
10 from palma.output_formating import print_results
11
12 # some useful constants
13
14 extended_alphabet = ['-','a','c','g','t','n','[',']']
15 alphabet = ['-','a','c','g','t','n']
16
17 alignment_alphabet = ['-','a','c','g','t','n','z']
18
19
20 def print_prediction(example):
21 dna_array = example['dna_array']
22 read_array = example['read_array']
23
24 dna = map(lambda x: alignment_alphabet[x],dna_array)
25 read = map(lambda x: alignment_alphabet[x],read_array)
26
27 spliceAlign = example['spliceAlign']
28 estAlign = example['estAlign']
29
30 line1,line2,line3 = pprint_alignment(spliceAlign, estAlign, dna, read)
31
32 result = '%s\n%s\n%s' %(line1,line2,line3)
33 return result
34
35 def calc_info(acc,don,exons,qualities):
36 min_score = 100
37 max_score = -100
38
39 for elem in acc:
40 for score in elem:
41 if score != -inf:
42 min_score = min(min_score,score)
43 max_score = max(max_score,score)
44
45 print 'Acceptors max/min: %f / %f' % (max_score,min_score)
46
47 min_score = 100
48 max_score = -100
49
50 for elem in don:
51 for score in elem:
52 if score != -inf:
53 min_score = min(min_score,score)
54 max_score = max(max_score,score)
55
56 print 'Donor max/min: %f / %f' % (max_score,min_score)
57
58 min_score = 10000
59 max_score = 0
60 mean_intron_len = 0
61
62 for elem in exons:
63 intron_len = int(elem[1,0] - elem[0,1])
64 mean_intron_len += intron_len
65 min_score = min(min_score,intron_len)
66 max_score = max(max_score,intron_len)
67
68 mean_intron_len /= 1.0*len(exons)
69 print 'Intron length max/min: %d / %d mean length %f' % (max_score,min_score,mean_intron_len)
70
71 min_score = 10000
72 max_score = 0
73 mean_quality = 0
74
75 for qVec in qualities:
76 for elem in qVec:
77 min_score = min(min_score,elem)
78 max_score = max(max_score,elem)
79
80 print 'Quality values max/min: %d / %d mean' % (max_score,min_score)
81
82
83 def calc_stat(Acceptor,Donor,Exons):
84 maxAcc = -100
85 minAcc = 100
86 maxDon = -100
87 minDon = 100
88
89 acc_vec_pos = []
90 acc_vec_neg = []
91 don_vec_pos = []
92 don_vec_neg = []
93
94 for jdx in range(len(Acceptors)):
95 currentExons = Exons[jdx]
96 currentAcceptor = Acceptors[jdx]
97 currentAcceptor = currentAcceptor[1:]
98 currentAcceptor.append(-inf)
99 currentDonor = Donors[jdx]
100
101 for idx,elem in enumerate(currentAcceptor):
102 if idx == (int(currentExons[1,0])-1): # acceptor site
103 acc_vec_pos.append(elem)
104 else:
105 acc_vec_neg.append(elem)
106
107 for idx,elem in enumerate(currentDonor):
108 if idx == (int(currentExons[0,1])): # donor site
109 don_vec_pos.append(elem)
110 else:
111 don_vec_neg.append(elem)
112
113 acc_pos = [elem for elem in acc_vec_pos if elem != -inf]
114 acc_neg = [elem for elem in acc_vec_neg if elem != -inf]
115 don_pos = [elem for elem in don_vec_pos if elem != -inf]
116 don_neg = [elem for elem in don_vec_neg if elem != -inf]
117
118 for idx in range(len(Sequences)):
119 acc = [elem for elem in Acceptors[idx] if elem != -inf]
120 maxAcc = max(max(acc),maxAcc)
121 minAcc = min(min(acc),minAcc)
122
123 don = [elem for elem in Donors[idx] if elem != -inf]
124 maxDon = max(max(don),maxDon)
125 minDon = min(min(don),minDon)
126
127
128 def pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
129 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds) =\
130 pu.get_splice_info(_newSpliceAlign,_newEstAlign)
131
132 t_strand = '+'
133 translation = '-ACGTN_' #how aligned est and dna sequence is displayed
134 #(gap_char, 5 nucleotides, intron_char)
135 comparison_char = '| ' #first: match_char, second: intron_char
136
137 (exon_length, identity, ssQuality, exonQuality, comparison, qIDX, tIDX) =\
138 pu.comp_identity(dna_array, est_array, t_strand, qStart, tStart, translation, comparison_char)
139
140 first_identity = None
141 last_identity = None
142 for i in range(len(comparison)):
143 if comparison[i] == '|' and first_identity is None:
144 first_identity = i
145 if comparison[-i] == '|' and last_identity is None:
146 last_identity = len(comparison) - i - 1
147
148 try:
149 for idx in range(len(dna_array)):
150 dna_array[idx] = translation[int(dna_array[idx])]
151 est_array[idx] = translation[int(est_array[idx])]
152 except:
153 #pdb.set_trace()
154 pass
155
156 line1 = "".join(dna_array)
157 line2 = comparison
158 line3 = "".join(est_array)
159
160 return line1,line2,line3
161
162
163 def get_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
164 return pu.get_splice_info(_newSpliceAlign,_newEstAlign)
165
166
167 ##########
168
169 def split_file(filename,result_dir,num_parts):
170 """
171 Splits a file of n lines into num_parts parts
172 """
173
174 jp = os.path.join
175
176 all_intervals = []
177
178 print 'counting lines'
179 line_ctr = 0
180 for line in open(filename,'r'):
181 line_ctr += 1
182
183 print 'found %d lines' % line_ctr
184
185 part_size = line_ctr / num_parts
186 begin = 0
187 end = 0
188
189 for idx in range(1,num_parts+1):
190 begin = end
191
192 if idx == num_parts:
193 end = line_ctr
194 else:
195 end = begin+part_size+1
196
197 all_intervals.append((begin,end))
198
199 parts_fn = []
200 for pos,params in enumerate(all_intervals):
201 beg,end = params
202 out_fn = jp(result_dir,'map.part_%d'%pos)
203 print out_fn
204 parts_fn.append(out_fn)
205 out_fh = open(out_fn,'w+')
206
207 parts_fn.reverse()
208 all_intervals.reverse()
209
210 lineCtr = 0
211 beg = -1
212 end = -1
213 in_fh = open(filename,'r')
214 while True:
215 line = in_fh.readline()
216 if line == '':
217 break
218
219 if beg <= lineCtr < end:
220 out_fh.write(line)
221 lineCtr += 1
222 else:
223 (beg,end) = all_intervals.pop()
224 out_fn = parts_fn.pop()
225 out_fh.close()
226 out_fh = open(out_fn,'w+')
227 out_fh.write(line)
228
229 out_fh.close()
230
231
232 if __name__ == '__main__':
233 split_file('/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/map.vm','/tmp',25)
234
235