+ added alignment reconstruction function
[qpalma.git] / scripts / Utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import pdb
5
6 from numpy.matlib import mat,zeros,ones,inf
7
8 import palma.palma_utils as pu
9 from palma.output_formating import print_results
10
11 # some useful constants
12
13 extended_alphabet = ['-','a','c','g','t','n','[',']']
14 alphabet = ['-','a','c','g','t','n']
15
16 def calc_info(acc,don,exons,qualities):
17 min_score = 100
18 max_score = -100
19
20 for elem in acc:
21 for score in elem:
22 if score != -inf:
23 min_score = min(min_score,score)
24 max_score = max(max_score,score)
25
26 print 'Acceptors max/min: %f / %f' % (max_score,min_score)
27
28 min_score = 100
29 max_score = -100
30
31 for elem in don:
32 for score in elem:
33 if score != -inf:
34 min_score = min(min_score,score)
35 max_score = max(max_score,score)
36
37 print 'Donor max/min: %f / %f' % (max_score,min_score)
38
39 min_score = 10000
40 max_score = 0
41 mean_intron_len = 0
42
43 for elem in exons:
44 intron_len = int(elem[1,0] - elem[0,1])
45 mean_intron_len += intron_len
46 min_score = min(min_score,intron_len)
47 max_score = max(max_score,intron_len)
48
49 mean_intron_len /= 1.0*len(exons)
50 print 'Intron length max/min: %d / %d mean length %f' % (max_score,min_score,mean_intron_len)
51
52 min_score = 10000
53 max_score = 0
54 mean_quality = 0
55
56 for qVec in qualities:
57 for elem in qVec:
58 min_score = min(min_score,elem)
59 max_score = max(max_score,elem)
60
61 print 'Quality values max/min: %d / %d mean' % (max_score,min_score)
62
63
64 def calc_stat(Acceptor,Donor,Exons):
65 maxAcc = -100
66 minAcc = 100
67 maxDon = -100
68 minDon = 100
69
70 acc_vec_pos = []
71 acc_vec_neg = []
72 don_vec_pos = []
73 don_vec_neg = []
74
75 for jdx in range(len(Acceptors)):
76 currentExons = Exons[jdx]
77 currentAcceptor = Acceptors[jdx]
78 currentAcceptor = currentAcceptor[1:]
79 currentAcceptor.append(-inf)
80 currentDonor = Donors[jdx]
81
82 for idx,elem in enumerate(currentAcceptor):
83 if idx == (int(currentExons[1,0])-1): # acceptor site
84 acc_vec_pos.append(elem)
85 else:
86 acc_vec_neg.append(elem)
87
88 for idx,elem in enumerate(currentDonor):
89 if idx == (int(currentExons[0,1])): # donor site
90 don_vec_pos.append(elem)
91 else:
92 don_vec_neg.append(elem)
93
94 acc_pos = [elem for elem in acc_vec_pos if elem != -inf]
95 acc_neg = [elem for elem in acc_vec_neg if elem != -inf]
96 don_pos = [elem for elem in don_vec_pos if elem != -inf]
97 don_neg = [elem for elem in don_vec_neg if elem != -inf]
98
99 for idx in range(len(Sequences)):
100 acc = [elem for elem in Acceptors[idx] if elem != -inf]
101 maxAcc = max(max(acc),maxAcc)
102 minAcc = min(min(acc),minAcc)
103
104 don = [elem for elem in Donors[idx] if elem != -inf]
105 maxDon = max(max(don),maxDon)
106 minDon = min(min(don),minDon)
107
108
109 def pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
110 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds) =\
111 pu.get_splice_info(_newSpliceAlign,_newEstAlign)
112
113 t_strand = '+'
114 translation = '-ACGTN_' #how aligned est and dna sequence is displayed
115 #(gap_char, 5 nucleotides, intron_char)
116 comparison_char = '| ' #first: match_char, second: intron_char
117
118 (exon_length, identity, ssQuality, exonQuality, comparison, qIDX, tIDX) =\
119 pu.comp_identity(dna_array, est_array, t_strand, qStart, tStart, translation, comparison_char)
120
121 first_identity = None
122 last_identity = None
123 for i in range(len(comparison)):
124 if comparison[i] == '|' and first_identity is None:
125 first_identity = i
126 if comparison[-i] == '|' and last_identity is None:
127 last_identity = len(comparison) - i - 1
128
129 try:
130 for idx in range(len(dna_array)):
131 dna_array[idx] = translation[int(dna_array[idx])]
132 est_array[idx] = translation[int(est_array[idx])]
133 except:
134 #pdb.set_trace()
135 pass
136
137 line1 = "".join(dna_array)
138 line2 = comparison
139 line3 = "".join(est_array)
140
141 return line1,line2,line3
142
143
144 def get_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
145 return pu.get_splice_info(_newSpliceAlign,_newEstAlign)
146
147
148 ##########
149
150 def split_file_join_results(filename,parts):
151 all_intervals = []
152
153 print 'counting lines'
154 line_ctr = 0
155 for line in open(filename,'r'):
156 line_ctr += 1
157
158 print 'found %d lines' % line_ctr
159
160 part = line_ctr / parts
161 begin = 0
162 end = 0
163 for idx in range(1,parts+1):
164
165 if idx == parts:
166 begin = end
167 end = line_ctr
168 else:
169 begin = end
170 end = begin+part
171
172 params = (begin,end)
173 print begin,end
174
175 all_intervals.append(params)
176
177 pdb.set_trace()
178
179 infile = open(filename,'r')
180
181 parts_fn = []
182 for pos,params in enumerate(all_intervals):
183 beg,end = params
184 print params
185 out_fn = '%s.part_%d'%(filename,pos)
186 parts_fn.append(out_fn)
187 #out_fh = open(out_fn,'w+')
188 print out_fn
189
190 parts_fn.reverse()
191 all_intervals.reverse()
192
193 lineCtr = 0
194 beg = -1
195 end = -1
196 for line in open(filename,'r'):
197 if beg <= lineCtr < end:
198 out_fh.write(line)
199 lineCtr += 1
200 else:
201 params = all_intervals.pop()
202 beg,end = params
203 out_fn = parts_fn.pop()
204 out_fh = open(out_fn,'w+')
205
206 out_fh.close()
207
208
209 if __name__ == '__main__':
210 split_file_join_results('/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/map.vm',10)