+ added some testcases
[qpalma.git] / scripts / Utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import pdb
5 import os.path
6
7 from numpy.matlib import mat,zeros,ones,inf
8
9 import palma.palma_utils as pu
10 from palma.output_formating import print_results
11
12 # some useful constants
13
14 extended_alphabet = ['-','a','c','g','t','n','[',']']
15 alphabet = ['-','a','c','g','t','n']
16
17 def calc_info(acc,don,exons,qualities):
18 min_score = 100
19 max_score = -100
20
21 for elem in acc:
22 for score in elem:
23 if score != -inf:
24 min_score = min(min_score,score)
25 max_score = max(max_score,score)
26
27 print 'Acceptors max/min: %f / %f' % (max_score,min_score)
28
29 min_score = 100
30 max_score = -100
31
32 for elem in don:
33 for score in elem:
34 if score != -inf:
35 min_score = min(min_score,score)
36 max_score = max(max_score,score)
37
38 print 'Donor max/min: %f / %f' % (max_score,min_score)
39
40 min_score = 10000
41 max_score = 0
42 mean_intron_len = 0
43
44 for elem in exons:
45 intron_len = int(elem[1,0] - elem[0,1])
46 mean_intron_len += intron_len
47 min_score = min(min_score,intron_len)
48 max_score = max(max_score,intron_len)
49
50 mean_intron_len /= 1.0*len(exons)
51 print 'Intron length max/min: %d / %d mean length %f' % (max_score,min_score,mean_intron_len)
52
53 min_score = 10000
54 max_score = 0
55 mean_quality = 0
56
57 for qVec in qualities:
58 for elem in qVec:
59 min_score = min(min_score,elem)
60 max_score = max(max_score,elem)
61
62 print 'Quality values max/min: %d / %d mean' % (max_score,min_score)
63
64
65 def calc_stat(Acceptor,Donor,Exons):
66 maxAcc = -100
67 minAcc = 100
68 maxDon = -100
69 minDon = 100
70
71 acc_vec_pos = []
72 acc_vec_neg = []
73 don_vec_pos = []
74 don_vec_neg = []
75
76 for jdx in range(len(Acceptors)):
77 currentExons = Exons[jdx]
78 currentAcceptor = Acceptors[jdx]
79 currentAcceptor = currentAcceptor[1:]
80 currentAcceptor.append(-inf)
81 currentDonor = Donors[jdx]
82
83 for idx,elem in enumerate(currentAcceptor):
84 if idx == (int(currentExons[1,0])-1): # acceptor site
85 acc_vec_pos.append(elem)
86 else:
87 acc_vec_neg.append(elem)
88
89 for idx,elem in enumerate(currentDonor):
90 if idx == (int(currentExons[0,1])): # donor site
91 don_vec_pos.append(elem)
92 else:
93 don_vec_neg.append(elem)
94
95 acc_pos = [elem for elem in acc_vec_pos if elem != -inf]
96 acc_neg = [elem for elem in acc_vec_neg if elem != -inf]
97 don_pos = [elem for elem in don_vec_pos if elem != -inf]
98 don_neg = [elem for elem in don_vec_neg if elem != -inf]
99
100 for idx in range(len(Sequences)):
101 acc = [elem for elem in Acceptors[idx] if elem != -inf]
102 maxAcc = max(max(acc),maxAcc)
103 minAcc = min(min(acc),minAcc)
104
105 don = [elem for elem in Donors[idx] if elem != -inf]
106 maxDon = max(max(don),maxDon)
107 minDon = min(min(don),minDon)
108
109
110 def pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
111 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds) =\
112 pu.get_splice_info(_newSpliceAlign,_newEstAlign)
113
114 t_strand = '+'
115 translation = '-ACGTN_' #how aligned est and dna sequence is displayed
116 #(gap_char, 5 nucleotides, intron_char)
117 comparison_char = '| ' #first: match_char, second: intron_char
118
119 (exon_length, identity, ssQuality, exonQuality, comparison, qIDX, tIDX) =\
120 pu.comp_identity(dna_array, est_array, t_strand, qStart, tStart, translation, comparison_char)
121
122 first_identity = None
123 last_identity = None
124 for i in range(len(comparison)):
125 if comparison[i] == '|' and first_identity is None:
126 first_identity = i
127 if comparison[-i] == '|' and last_identity is None:
128 last_identity = len(comparison) - i - 1
129
130 try:
131 for idx in range(len(dna_array)):
132 dna_array[idx] = translation[int(dna_array[idx])]
133 est_array[idx] = translation[int(est_array[idx])]
134 except:
135 #pdb.set_trace()
136 pass
137
138 line1 = "".join(dna_array)
139 line2 = comparison
140 line3 = "".join(est_array)
141
142 return line1,line2,line3
143
144
145 def get_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
146 return pu.get_splice_info(_newSpliceAlign,_newEstAlign)
147
148
149 ##########
150
151 def split_file(filename,result_dir,num_parts):
152 """
153 Splits a file of n lines into num_parts parts
154 """
155
156 jp = os.path.join
157
158 all_intervals = []
159
160 print 'counting lines'
161 line_ctr = 0
162 for line in open(filename,'r'):
163 line_ctr += 1
164
165 print 'found %d lines' % line_ctr
166
167 part_size = line_ctr / num_parts
168 begin = 0
169 end = 0
170
171 for idx in range(1,num_parts+1):
172 begin = end
173
174 if idx == num_parts:
175 end = line_ctr
176 else:
177 end = begin+part_size+1
178
179 all_intervals.append((begin,end))
180
181 parts_fn = []
182 for pos,params in enumerate(all_intervals):
183 beg,end = params
184 out_fn = jp(result_dir,'map.part_%d'%pos)
185 print out_fn
186 parts_fn.append(out_fn)
187 out_fh = open(out_fn,'w+')
188
189 parts_fn.reverse()
190 all_intervals.reverse()
191
192 lineCtr = 0
193 beg = -1
194 end = -1
195 in_fh = open(filename,'r')
196 while True:
197 line = in_fh.readline()
198 if line == '':
199 break
200
201 if beg <= lineCtr < end:
202 out_fh.write(line)
203 lineCtr += 1
204 else:
205 (beg,end) = all_intervals.pop()
206 out_fn = parts_fn.pop()
207 out_fh.close()
208 out_fh = open(out_fn,'w+')
209 out_fh.write(line)
210
211 out_fh.close()
212
213
214 if __name__ == '__main__':
215 split_file('/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/map.vm','/tmp',25)
216
217