74d83a4b1c64ffba68ab6ff897cc55259e4b1e5d
[qpalma.git] / scripts / Utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import pdb
5
6 from numpy.matlib import mat,zeros,ones,inf
7
8 import palma.palma_utils as pu
9 from palma.output_formating import print_results
10
11 # some useful constants
12
13 extended_alphabet = ['-','a','c','g','t','n','[',']']
14 alphabet = ['-','a','c','g','t','n']
15
16 def calc_info(acc,don,exons,qualities):
17 min_score = 100
18 max_score = -100
19
20 for elem in acc:
21 for score in elem:
22 if score != -inf:
23 min_score = min(min_score,score)
24 max_score = max(max_score,score)
25
26 print 'Acceptors max/min: %f / %f' % (max_score,min_score)
27
28 min_score = 100
29 max_score = -100
30
31 for elem in don:
32 for score in elem:
33 if score != -inf:
34 min_score = min(min_score,score)
35 max_score = max(max_score,score)
36
37 print 'Donor max/min: %f / %f' % (max_score,min_score)
38
39 min_score = 10000
40 max_score = 0
41 mean_intron_len = 0
42
43 for elem in exons:
44 intron_len = int(elem[1,0] - elem[0,1])
45 mean_intron_len += intron_len
46 min_score = min(min_score,intron_len)
47 max_score = max(max_score,intron_len)
48
49 mean_intron_len /= 1.0*len(exons)
50 print 'Intron length max/min: %d / %d mean length %f' % (max_score,min_score,mean_intron_len)
51
52 min_score = 10000
53 max_score = 0
54 mean_quality = 0
55
56 for qVec in qualities:
57 for elem in qVec:
58 min_score = min(min_score,elem)
59 max_score = max(max_score,elem)
60
61 print 'Quality values max/min: %d / %d mean' % (max_score,min_score)
62
63
64 def calc_stat(Acceptor,Donor,Exons):
65 maxAcc = -100
66 minAcc = 100
67 maxDon = -100
68 minDon = 100
69
70 acc_vec_pos = []
71 acc_vec_neg = []
72 don_vec_pos = []
73 don_vec_neg = []
74
75 for jdx in range(len(Acceptors)):
76 currentExons = Exons[jdx]
77 currentAcceptor = Acceptors[jdx]
78 currentAcceptor = currentAcceptor[1:]
79 currentAcceptor.append(-inf)
80 currentDonor = Donors[jdx]
81
82 for idx,elem in enumerate(currentAcceptor):
83 if idx == (int(currentExons[1,0])-1): # acceptor site
84 acc_vec_pos.append(elem)
85 else:
86 acc_vec_neg.append(elem)
87
88 for idx,elem in enumerate(currentDonor):
89 if idx == (int(currentExons[0,1])): # donor site
90 don_vec_pos.append(elem)
91 else:
92 don_vec_neg.append(elem)
93
94 acc_pos = [elem for elem in acc_vec_pos if elem != -inf]
95 acc_neg = [elem for elem in acc_vec_neg if elem != -inf]
96 don_pos = [elem for elem in don_vec_pos if elem != -inf]
97 don_neg = [elem for elem in don_vec_neg if elem != -inf]
98
99 for idx in range(len(Sequences)):
100 acc = [elem for elem in Acceptors[idx] if elem != -inf]
101 maxAcc = max(max(acc),maxAcc)
102 minAcc = min(min(acc),minAcc)
103
104 don = [elem for elem in Donors[idx] if elem != -inf]
105 maxDon = max(max(don),maxDon)
106 minDon = min(min(don),minDon)
107
108
109 def pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
110 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds) =\
111 pu.get_splice_info(_newSpliceAlign,_newEstAlign)
112
113 t_strand = '+'
114 translation = '-ACGTN_' #how aligned est and dna sequence is displayed
115 #(gap_char, 5 nucleotides, intron_char)
116 comparison_char = '| ' #first: match_char, second: intron_char
117
118 (exon_length, identity, ssQuality, exonQuality, comparison, qIDX, tIDX) =\
119 pu.comp_identity(dna_array, est_array, t_strand, qStart, tStart, translation, comparison_char)
120
121 first_identity = None
122 last_identity = None
123 for i in range(len(comparison)):
124 if comparison[i] == '|' and first_identity is None:
125 first_identity = i
126 if comparison[-i] == '|' and last_identity is None:
127 last_identity = len(comparison) - i - 1
128
129 try:
130 for idx in range(len(dna_array)):
131 dna_array[idx] = translation[int(dna_array[idx])]
132 est_array[idx] = translation[int(est_array[idx])]
133 except:
134 pdb.set_trace()
135
136 line1 = "".join(dna_array)
137 line2 = comparison
138 line3 = "".join(est_array)
139
140 return line1,line2,line3
141
142
143 def get_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
144 return pu.get_splice_info(_newSpliceAlign,_newEstAlign)
145
146
147 ##########
148
149 def split_file_join_results(filename,parts):
150 all_intervals = []
151
152 print 'counting lines'
153 line_ctr = 0
154 for line in open(filename,'r'):
155 line_ctr += 1
156
157 print 'found %d lines' % line_ctr
158
159 part = line_ctr / parts
160 begin = 0
161 end = 0
162 for idx in range(1,parts+1):
163
164 if idx == parts:
165 begin = end
166 end = line_ctr
167 else:
168 begin = end
169 end = begin+part
170
171 params = (begin,end)
172 print begin,end
173
174 all_intervals.append(params)
175
176 pdb.set_trace()
177
178 infile = open(filename,'r')
179
180 parts_fn = []
181 for pos,params in enumerate(all_intervals):
182 beg,end = params
183 print params
184 out_fn = '%s.part_%d'%(filename,pos)
185 parts_fn.append(out_fn)
186 #out_fh = open(out_fn,'w+')
187 print out_fn
188
189 parts_fn.reverse()
190 all_intervals.reverse()
191
192 lineCtr = 0
193 beg = -1
194 end = -1
195 for line in open(filename,'r'):
196 if beg <= lineCtr < end:
197 out_fh.write(line)
198 lineCtr += 1
199 else:
200 params = all_intervals.pop()
201 beg,end = params
202 out_fn = parts_fn.pop()
203 out_fh = open(out_fn,'w+')
204
205 out_fh.close()
206
207
208 if __name__ == '__main__':
209 split_file_join_results('/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/map.vm',10)