1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
9 import pdb
10 import sys
11 import os.path
13 from numpy.matlib import mat,zeros,ones,inf
15 import palma_utils as pu
# some useful constants

# DNA alphabet plus gap char '-' and ambiguity code 'n'; '[' and ']'
# presumably bracket special regions (e.g. splice sites) -- TODO confirm
extended_alphabet = ['-','a','c','g','t','n','[',']']
alphabet = ['-','a','c','g','t','n']

# alphabet used to decode integer-coded alignment rows; the meaning of
# the extra symbol 'z' is not visible in this file -- TODO confirm
alignment_alphabet = ['-','a','c','g','t','n','z']
def print_prediction(example):
    """
    Render one alignment prediction as a three-line string:
    DNA row, comparison row, EST/read row.

    example -- dict holding at least 'dna_array', 'spliceAlign' and
               'estAlign' (integer-coded sequences).
    """
    dna_array = example['dna_array']

    # decode the integer-coded DNA into display characters
    dna = map(lambda x: alignment_alphabet[x],dna_array)

    spliceAlign = example['spliceAlign']
    estAlign = example['estAlign']

    # NOTE(review): 'read' is undefined in the visible code -- the lines
    # extracting/decoding the read sequence from 'example' appear to be
    # missing from this chunk; as written this raises a NameError.
    line1,line2,line3 = pprint_alignment(spliceAlign, estAlign, dna, read)

    result = '%s\n%s\n%s' %(line1,line2,line3)
    return result
40 def calc_info(acc,don,exons,qualities):
41 min_score = 100
42 max_score = -100
44 for elem in acc:
45 for score in elem:
46 if score != -inf:
47 min_score = min(min_score,score)
48 max_score = max(max_score,score)
50 print 'Acceptors max/min: %f / %f' % (max_score,min_score)
52 min_score = 100
53 max_score = -100
55 for elem in don:
56 for score in elem:
57 if score != -inf:
58 min_score = min(min_score,score)
59 max_score = max(max_score,score)
61 print 'Donor max/min: %f / %f' % (max_score,min_score)
63 min_score = 10000
64 max_score = 0
65 mean_intron_len = 0
67 for elem in exons:
68 intron_len = int(elem[1,0] - elem[0,1])
69 mean_intron_len += intron_len
70 min_score = min(min_score,intron_len)
71 max_score = max(max_score,intron_len)
73 mean_intron_len /= 1.0*len(exons)
74 print 'Intron length max/min: %d / %d mean length %f' % (max_score,min_score,mean_intron_len)
76 min_score = 10000
77 max_score = 0
78 mean_quality = 0
80 for qVec in qualities:
81 for elem in qVec:
82 min_score = min(min_score,elem)
83 max_score = max(max_score,elem)
85 print 'Quality values max/min: %d / %d mean' % (max_score,min_score)
def calc_stat(Acceptor,Donor,Exons):
    """
    Collect positive/negative acceptor and donor site scores and the
    global score extrema over a dataset.

    Acceptor, Donor -- lists of per-example score lists; -inf marks
                       positions without a predicted site.
    Exons           -- list of 2x2 matrix-like exon boundary pairs;
                       Exons[j][1,0] is the acceptor position and
                       Exons[j][0,1] the donor position of example j.

    BUGFIX: the body referenced the undefined names 'Acceptors', 'Donors'
    and 'Sequences' (NameError at runtime); they are replaced by the
    actual parameters Acceptor/Donor and by len(Acceptor).

    NOTE(review): all computed statistics are local only -- the function
    returns None, matching the original interface.
    """
    maxAcc = -100
    minAcc = 100
    maxDon = -100
    minDon = 100

    acc_vec_pos = []
    acc_vec_neg = []
    don_vec_pos = []
    don_vec_neg = []

    for jdx in range(len(Acceptor)):
        currentExons = Exons[jdx]
        # acceptor scores are shifted left by one position: the score for
        # position p is stored at index p-1 after this shift
        currentAcceptor = Acceptor[jdx]
        currentAcceptor = currentAcceptor[1:]
        currentAcceptor.append(-inf)
        currentDonor = Donor[jdx]

        for idx,elem in enumerate(currentAcceptor):
            if idx == (int(currentExons[1,0])-1): # acceptor site
                acc_vec_pos.append(elem)
            else:
                acc_vec_neg.append(elem)

        for idx,elem in enumerate(currentDonor):
            if idx == (int(currentExons[0,1])): # donor site
                don_vec_pos.append(elem)
            else:
                don_vec_neg.append(elem)

    # drop sentinel -inf entries before any further use
    acc_pos = [elem for elem in acc_vec_pos if elem != -inf]
    acc_neg = [elem for elem in acc_vec_neg if elem != -inf]
    don_pos = [elem for elem in don_vec_pos if elem != -inf]
    don_neg = [elem for elem in don_vec_neg if elem != -inf]

    for idx in range(len(Acceptor)):
        acc = [elem for elem in Acceptor[idx] if elem != -inf]
        maxAcc = max(max(acc),maxAcc)
        minAcc = min(min(acc),minAcc)

        don = [elem for elem in Donor[idx] if elem != -inf]
        maxDon = max(max(don),maxDon)
        minDon = min(min(don),minDon)
    # NOTE(review): this is the body of what is presumably
    # 'def pprint_alignment(...)' -- the def header and the right-hand
    # side of the next tuple assignment are missing from this chunk
    # (the line ends in a bare continuation backslash); code kept as-is.
    (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds) =\

    t_strand = '+'
    translation = '-ACGTN_' #how aligned est and dna sequence is displayed
    #(gap_char, 5 nucleotides, intron_char)
    comparison_char = '| ' #first: match_char, second: intron_char

    # per-exon identity/quality info computed by the palma_utils helper
    (exon_length, identity, ssQuality, exonQuality, comparison, qIDX, tIDX) =\
    pu.comp_identity(dna_array, est_array, t_strand, qStart, tStart, translation, comparison_char)

    # locate first and last matching column in the comparison row
    # NOTE(review): for i == 0, comparison[-i] is comparison[0], so the
    # last_identity scan looks off by one -- verify intended behaviour
    first_identity = None
    last_identity = None
    for i in range(len(comparison)):
        if comparison[i] == '|' and first_identity is None:
            first_identity = i
        if comparison[-i] == '|' and last_identity is None:
            last_identity = len(comparison) - i - 1

    try:
        # map integer codes to display characters in place
        for idx in range(len(dna_array)):
            dna_array[idx] = translation[int(dna_array[idx])]
            est_array[idx] = translation[int(est_array[idx])]
    except:
        #pdb.set_trace()
        pass

    line1 = "".join(dna_array)
    line2 = comparison
    line3 = "".join(est_array)

    return line1,line2,line3
172 ##########
174 def combine_files(partial_files,new_file):
175 try:
176 fh = open(new_file,'w')
177 except IOError:
178 print 'Problem while trying to open file: %s' % new_file
179 sys.exit(1)
181 for pfile in partial_files:
182 for line in open(pfile):
183 fh.write(line)
185 fh.close()
188 def split_file(filename,result_dir,num_parts):
189 """
190 Splits a file of n lines into num_parts parts
191 """
193 jp = os.path.join
195 all_intervals = []
197 print 'counting lines'
198 line_ctr = 0
199 for line in open(filename,'r'):
200 line_ctr += 1
202 print 'found %d lines' % line_ctr
204 part_size = line_ctr / num_parts
205 begin = 0
206 end = 0
208 for idx in range(1,num_parts+1):
209 begin = end
211 if idx == num_parts:
212 end = line_ctr
213 else:
214 end = begin+part_size+1
216 all_intervals.append((begin,end))
218 parts_fn = []
219 for pos,params in enumerate(all_intervals):
220 beg,end = params
221 out_fn = jp(result_dir,'map.part_%d'%pos)
222 print out_fn
223 parts_fn.append(out_fn)
224 out_fh = open(out_fn,'w+')
226 parts_fn.reverse()
227 all_intervals.reverse()
229 lineCtr = 0
230 beg = -1
231 end = -1
232 in_fh = open(filename,'r')
233 while True:
235 if line == '':
236 break
238 if beg <= lineCtr < end:
239 out_fh.write(line)
240 lineCtr += 1
241 else:
242 (beg,end) = all_intervals.pop()
243 out_fn = parts_fn.pop()
244 out_fh.close()
245 out_fh = open(out_fn,'w+')
246 out_fh.write(line)
248 out_fh.close()
def get_slices(dataset_size,num_nodes):
    """
    Given the size of the dataset and the number of nodes which shall be
    used this function returns the slice of the dataset for each node.

    Returns a list of num_nodes (begin,end) half-open intervals covering
    range(dataset_size); the last node absorbs the remainder.
    """
    all_instances = []

    # '//' is identical to '/' for Python 2 ints and stays an integer
    # floor division under Python 3
    part = dataset_size // num_nodes
    begin = 0
    end = 0
    for idx in range(1,num_nodes+1):
        begin = end
        if idx == num_nodes:
            # last slice takes whatever is left
            end = dataset_size
        else:
            end = begin+part

        params = (begin,end)
        all_instances.append(params)

    return all_instances
def logwrite(mes,settings):
    """
    A wrapper to write a message to the global logfile.

    mes      -- string appended verbatim to the log.
    settings -- dict whose 'global_log_fn' entry is the logfile path.
    """
    fh = open(settings['global_log_fn'],'a')
    try:
        fh.write(mes)
    finally:
        # BUGFIX: the file handle was previously never closed
        fh.close()
if __name__ == '__main__':
    # Ad-hoc driver: splits a hard-coded vmatch map file into 25 parts
    # under /tmp. The path is site-specific and only useful on the
    # original cluster host.
    split_file('/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/map.vm','/tmp',25)