+ fixed some bugs in the negative strand lookup table
[qpalma.git] / scripts / Utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from numpy.matlib import mat,zeros,ones,inf
5
6 import palma.palma_utils as pu
7 from palma.output_formating import print_results
8
9 # some useful constants
10
11 extended_alphabet = ['-','a','c','g','t','n','[',']']
12 alphabet = ['-','a','c','g','t','n']
13
14 def calc_info(acc,don,exons,qualities):
15 min_score = 100
16 max_score = -100
17
18 for elem in acc:
19 for score in elem:
20 if score != -inf:
21 min_score = min(min_score,score)
22 max_score = max(max_score,score)
23
24 print 'Acceptors max/min: %f / %f' % (max_score,min_score)
25
26 min_score = 100
27 max_score = -100
28
29 for elem in don:
30 for score in elem:
31 if score != -inf:
32 min_score = min(min_score,score)
33 max_score = max(max_score,score)
34
35 print 'Donor max/min: %f / %f' % (max_score,min_score)
36
37 min_score = 10000
38 max_score = 0
39 mean_intron_len = 0
40
41 for elem in exons:
42 intron_len = int(elem[1,0] - elem[0,1])
43 mean_intron_len += intron_len
44 min_score = min(min_score,intron_len)
45 max_score = max(max_score,intron_len)
46
47 mean_intron_len /= 1.0*len(exons)
48 print 'Intron length max/min: %d / %d mean length %f' % (max_score,min_score,mean_intron_len)
49
50 min_score = 10000
51 max_score = 0
52 mean_quality = 0
53
54 for qVec in qualities:
55 for elem in qVec:
56 min_score = min(min_score,elem)
57 max_score = max(max_score,elem)
58
59 print 'Quality values max/min: %d / %d mean' % (max_score,min_score)
60
61
62 def calc_stat(Acceptor,Donor,Exons):
63 maxAcc = -100
64 minAcc = 100
65 maxDon = -100
66 minDon = 100
67
68 acc_vec_pos = []
69 acc_vec_neg = []
70 don_vec_pos = []
71 don_vec_neg = []
72
73 for jdx in range(len(Acceptors)):
74 currentExons = Exons[jdx]
75 currentAcceptor = Acceptors[jdx]
76 currentAcceptor = currentAcceptor[1:]
77 currentAcceptor.append(-inf)
78 currentDonor = Donors[jdx]
79
80 for idx,elem in enumerate(currentAcceptor):
81 if idx == (int(currentExons[1,0])-1): # acceptor site
82 acc_vec_pos.append(elem)
83 else:
84 acc_vec_neg.append(elem)
85
86 for idx,elem in enumerate(currentDonor):
87 if idx == (int(currentExons[0,1])): # donor site
88 don_vec_pos.append(elem)
89 else:
90 don_vec_neg.append(elem)
91
92 acc_pos = [elem for elem in acc_vec_pos if elem != -inf]
93 acc_neg = [elem for elem in acc_vec_neg if elem != -inf]
94 don_pos = [elem for elem in don_vec_pos if elem != -inf]
95 don_neg = [elem for elem in don_vec_neg if elem != -inf]
96
97 for idx in range(len(Sequences)):
98 acc = [elem for elem in Acceptors[idx] if elem != -inf]
99 maxAcc = max(max(acc),maxAcc)
100 minAcc = min(min(acc),minAcc)
101
102 don = [elem for elem in Donors[idx] if elem != -inf]
103 maxDon = max(max(don),maxDon)
104 minDon = min(min(don),minDon)
105
106
107 def pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
108 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds) =\
109 pu.get_splice_info(_newSpliceAlign,_newEstAlign)
110
111 t_strand = '+'
112 translation = '-ACGTN_' #how aligned est and dna sequence is displayed
113 #(gap_char, 5 nucleotides, intron_char)
114 comparison_char = '| ' #first: match_char, second: intron_char
115
116 (exon_length, identity, ssQuality, exonQuality, comparison, qIDX, tIDX) =\
117 pu.comp_identity(dna_array, est_array, t_strand, qStart, tStart, translation, comparison_char)
118
119 first_identity = None
120 last_identity = None
121 for i in range(len(comparison)):
122 if comparison[i] == '|' and first_identity is None:
123 first_identity = i
124 if comparison[-i] == '|' and last_identity is None:
125 last_identity = len(comparison) - i - 1
126
127 try:
128 for idx in range(len(dna_array)):
129 dna_array[idx] = translation[int(dna_array[idx])]
130 est_array[idx] = translation[int(est_array[idx])]
131 except:
132 pdb.set_trace()
133
134 line1 = "".join(dna_array)
135 line2 = comparison
136 line3 = "".join(est_array)
137
138 return line1,line2,line3
139
140
141 def get_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
142 return pu.get_splice_info(_newSpliceAlign,_newEstAlign)
143
144
145 ##########
146
147
148 def split_file_join_results(filename,parts):
149 all_intervals = []
150
151 print 'counting lines'
152 line_ctr = 0
153 for line in open(filename,'r'):
154 line_ctr += 1
155
156 part = line_ctr / parts
157 begin = 0
158 end = 0
159 for idx in range(1,parts+1):
160
161 if idx == parts:
162 begin = end
163 end = line_ctr
164 else:
165 begin = end
166 end = begin+part
167
168 params = (begin,end)
169
170 all_intervals.append(params)
171
172 infile = open(filename,'r')
173
174 parts_fn = []
175 for pos,params in enumerate(all_intervals):
176 beg,end = params
177 print params
178 out_fn = '%s.part_%d'%(filename,pos)
179 parts_fn.append(out_fn)
180 #out_fh = open(out_fn,'w+')
181 print out_fn
182
183 parts_fn.reverse()
184 all_intervals.reverse()
185
186 lineCtr = 0
187 beg = -1
188 end = -1
189 for line in open(filename,'r'):
190 if beg <= lineCtr < end:
191 out_fh.write(line)
192 lineCtr += 1
193 else:
194 params = all_intervals.pop()
195 beg,end = params
196 out_fn = parts_fn.pop()
197 out_fh = open(out_fn,'w+')
198
199 out_fh.close()
200
201
202