+ extended evaluation function to return positionwise deviations
[qpalma.git] / qpalma / DataProc.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from numpy.matlib import mat,zeros,ones,inf
5 import cPickle
6 import pdb
7 import Configuration as Conf
8 from PyGff import *
9 import io_pickle
10 import scipy.io
11 import sys
12
13 from genome_utils import load_genomic
14
15 def paths_load_data_solexa(expt,genome_info,PAR,test=False):
16 # expt can be 'training','validation' or 'test'
17 assert expt in ['training','validation','test']
18
19 dna_filename = Conf.dna_filename
20 est_filename = Conf.est_filename
21
22 #tair7_seq_filename = Conf.tair7_seq_filename
23 #tair7_seq = cPickle.load(open(tair7_seq_filename))
24
25 allGenes = cPickle.load(open(dna_filename))
26
27 Sequences = []
28 Acceptors = []
29 Donors = []
30 Exons = []
31 Ests = []
32 Qualities = []
33 SplitPositions = []
34
35 # Iterate over all reads
36 for line in open(est_filename):
37 line = line.strip()
38 chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
39 splitpos = int(splitpos)
40 length = int(length)
41 prb = [ord(elem)-50 for elem in prb]
42 cal = [ord(elem)-64 for elem in cal]
43 chastity = [ord(elem)+10 for elem in chastity]
44
45 assert len(prb) == len(seq)
46
47 currentGene = allGenes[gene_id]
48 seq = seq.lower()
49
50 #try:
51 # currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower()
52 #except:
53 # continue
54
55 genome_file = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
56 chrom = 'chr1'
57
58 # use matlab-style functions to access dna sequence
59 dna = load_genomic(chrom,'+',currentGene.start,currentGene.stop,genome_file,one_based=True)
60 currentSeq = dna
61
62 if len(currentSeq) < (currentGene.stop - currentGene.start):
63 continue
64
65 # compare both sequences
66 #assert dna == currentSeq, pdb.set_trace()
67
68 p_start = int(p_start)
69 exon_stop = int(exon_stop)
70 exon_start = int(exon_start)
71 p_stop = int(p_stop)
72
73 assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
74 assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
75 assert p_stop - p_start >= 36
76
77 currentExons = zeros((2,2))
78
79 currentExons[0,0] = p_start
80 currentExons[0,1] = exon_stop
81
82 currentExons[1,0] = exon_start
83 currentExons[1,1] = p_stop
84
85 # make indices relative to start pos of the current gene
86 currentExons -= currentGene.start
87
88 # determine cutting positions for sequences
89 up_cut = int(currentExons[0,0])
90 down_cut = int(currentExons[1,1])
91
92 # if we perform testing we cut a much wider region as we want to detect
93 # how good we perform on a region
94 if test:
95 up_cut = up_cut-500
96 if up_cut < 0:
97 up_cut = 0
98
99 down_cut = down_cut+500
100
101 if down_cut > len(currentSeq):
102 down_cut = len(currentSeq)
103
104 # cut out part of the sequence
105 currentSeq = currentSeq[up_cut:down_cut]
106
107 # ensure that index is correct
108 currentExons[0,1] += 1
109
110 if test:
111 currentExons -= up_cut
112 else:
113 currentExons -= currentExons[0,0]
114
115 try:
116 if not (currentSeq[int(currentExons[0,1])] == 'g' and\
117 currentSeq[int(currentExons[0,1])+1] in ['c','t' ]):
118 continue
119
120 if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
121 continue
122
123 except:
124 pdb.set_trace()
125
126 Sequences.append(currentSeq)
127
128 # now we want to use interval_query to get the predicted splice scores
129 # trained on the TAIR sequence and annotation
130
131 from Genefinding import *
132
133 interval_matrix = createIntArrayFromList([currentGene.start+up_cut,currentGene.start+down_cut])
134 num_fields = 1
135 num_intervals = 1
136
137 # fetch acceptor scores
138 fname = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_1+'
139 gf = Genef()
140 num_hits = gf.query(fname, ['Conf'], num_fields, interval_matrix, num_intervals)
141 assert num_hits <= len(currentSeq)
142
143 #print 'Acceptor hits: %d' % num_hits
144
145 pos = createIntArrayFromList([0]*num_hits)
146 indices = createIntArrayFromList([0]*num_hits)
147 scores = createDoubleArrayFromList([0]*num_hits*num_fields)
148 gf.getResults(pos,indices,scores)
149
150 currentAcceptors = [-inf]*(len(currentSeq)+1)
151
152 for i in range(num_hits):
153 position = pos[i]
154 position -= (currentGene.start+up_cut)
155 assert 0 <= position < len(currentSeq)+1, 'position index wrong'
156 currentAcceptors[position] = scores[i]
157
158 currentAcceptors = currentAcceptors[1:] + [-inf]
159 Acceptors.append(currentAcceptors)
160
161 del gf
162
163 # fetch donor scores
164 fname = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+'
165 gf = Genef()
166 num_hits = gf.query(fname, ['Conf'], num_fields, interval_matrix, num_intervals)
167 assert num_hits <= len(currentSeq)
168
169 #print 'Donor hits: %d' % num_hits
170 pos = createIntArrayFromList([0]*num_hits)
171 indices = createIntArrayFromList([0]*num_hits)
172 scores = createDoubleArrayFromList([0]*num_hits*num_fields)
173 gf.getResults(pos,indices,scores)
174
175 currentDonors = [-inf]*(len(currentSeq))
176
177 try:
178 for i in range(1,num_hits):
179 position = pos[i]
180 position -= (currentGene.start+up_cut)
181 currentDonors[position-1] = scores[i]
182 except:
183 pdb.set_trace()
184
185 currentDonors = [-inf] + currentDonors[:-1]
186 Donors.append(currentDonors)
187
188 Exons.append(currentExons)
189 Ests.append(seq)
190 Qualities.append(prb)
191
192 SplitPositions.append(splitpos)
193
194 #if len(Sequences[-1]) == 2755:
195 # Sequences = Sequences[:-1]
196 # Acceptors = Acceptors[:-1]
197 # Donors = Donors[:-1]
198 # Exons = Exons[:-1]
199 # Ests = Ests[:-1]
200 # Qualities = Qualities[:-1]
201 # SplitPositions = SplitPositions[:-1]
202
203 #print 'found %d examples' % len(Sequences)
204
205 return Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions
206
207
208 def paths_load_data_pickle(expt,genome_info,PAR):
209 """
210
211 """
212
213 # function [Sequences, Acceptors, Donors, Exons, Ests, Noises] = paths_load_data(expt,genome_info,PAR)
214 # Load the relevant file and return the alignment data
215
216 # expt can be 'training','validation' or 'test'
217
218 assert expt in ['training','validation','test']
219
220 tmp_dir = '/fml/ag-raetsch/home/fabio/tmp'
221
222 Noises = [];
223
224 if expt == 'training':
225 if PAR.microexon:
226 if PAR.LOCAL_ALIGN: # local version
227
228 train_data = '%s/microexon_train_data.pickle' % genome_info.basedir
229 data = cPickle.load(open(train_data))
230
231 else: # global version
232 pass
233
234
235 else:
236 train_data = '%s/exons_train_local.pickle' % genome_info.basedir
237 data = cPickle.load(open(train_data))
238
239 #print 'train_data is %s' % train_data
240
241 Sequences = data['Train'] # dna sequences
242 Acceptors = data['TrainAcc'] # acceptor scores
243 Donors = data['TrainDon'] # donor scores
244 Exons = data['TrainExon'] # exon boundaries
245 Ests = data['TrainEsts'] # est sequences
246
247 # Lower all indices by one to convert matlab
248 # to python indices
249
250 Exons -= 1
251
252 return Sequences, Acceptors, Donors, Exons, Ests, Noises
253
254 def paths_load_data(expt,genome_info,PAR):
255 """
256
257 """
258
259 # function [Sequences, Acceptors, Donors, Exons, Ests, Noises] = paths_load_data(expt,genome_info,PAR)
260 # Load the relevant file and return the alignment data
261
262 # expt can be 'training','validation' or 'test'
263
264 assert expt in ['training','validation','test']
265
266 tmp_dir = '/fml/ag-raetsch/home/fabio/tmp'
267
268 Noises = [];
269
270 if expt == 'training':
271 if PAR.microexon:
272 if PAR.LOCAL_ALIGN: # local version
273
274 train_data = '%s/microexon_train_data_cut_local.mat' % genome_info.basedir
275 train_data = '%s/microexon_train_data.mat' % genome_info.basedir
276 #train_data_pickle = '%s/microexon_train_data_cut_local.pickle'% tmp_dir
277 #io_pickle.convert_v6(train_data,train_data_pickle)
278 #train_data = io_pickle.load(train_data_pickle)
279 data = scipy.io.loadmat(train_data)
280
281 else: # global version
282
283 train_data = '%s/microexon_train_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat' %\
284 (genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob)
285
286 train_data = '%s/microexon_train_data.mat' % genome_info.basedir
287 #train_data_pickle = '%s/microexon_train_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.pickle' %\
288 # (tmp_dir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob)
289
290 #io_pickle.convert_v6(train_data,train_data_pickle)
291 #train_data = io_pickle.load(train_data_pickle)
292 data = scipy.io.loadmat(train_data)
293 Noises = data['TrainNoise'] # substitution matrix
294
295 else:
296 train_data = '%s/exons_train_local.mat' % genome_info.basedir
297 #train_data_pickle = '%s/exons_train_local.pickle'% tmp_dir
298 #io_pickle.convert_v6(train_data,train_data_pickle)
299 #microexon_train_data = io_pickle.load(train_data_pickle)
300 data = scipy.io.loadmat(train_data)
301
302 #print 'train_data is %s' % train_data
303
304 Sequences = data['Train'] # dna sequences
305 Acceptors = data['TrainAcc'] # acceptor scores
306 Donors = data['TrainDon'] # donor scores
307 Exons = data['TrainExon'] # exon boundaries
308 Ests = data['TrainEsts'] # est sequences
309
310 #elif expt == 'validation':
311 # print('Loading validation data\n') ;
312 # if PAR.microexon,
313 # if PAR.LOCAL_ALIGN
314 # %local version
315 # load(sprintf('%s/microexon_val_data_cut_local_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
316 # genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
317 # 'ValEsts', 'ValExon', 'Val', 'ValAcc', 'ValDon') ;
318 # else
319 # %global version
320 # load(sprintf('%s/microexon_val_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
321 # genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
322 # 'ValEsts', 'ValExon', 'Val', 'ValAcc', 'ValDon') ;
323 # end
324 # else
325 # load(sprintf('%s/exons_val_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
326 # genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
327 # 'ValEsts', 'ValExon', 'Val', 'ValAcc', 'ValDon') ;
328 # end
329 #
330 # Sequences = Val ; % dna sequences
331 # Acceptors = ValAcc ; % acceptor scores
332 # Donors = ValDon ; % donor scores
333 # Exons = ValExon ; % exon boundaries
334 # Ests = ValEsts ; % est sequences
335
336
337
338 #elif expt == 'test':
339 # fprintf('Loading test data\n') ;
340 # if PAR.microexon,
341 # if PAR.LOCAL_ALIGN
342 # %local version
343 # load(sprintf('%s/microexon_test_data_cut_local_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
344 # genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
345 # 'TestEsts', 'TestExon', 'Test', 'TestAcc', 'TestDon') ;
346 # else
347 # %global version
348 # load(sprintf('%s/microexon_test_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
349 # genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
350 # 'TestEsts', 'TestExon', 'Test','TestAcc', 'TestDon', 'TestNoise') ;
351 # Noises = TestNoise ; % substitution matrix
352 # end
353 # else
354 # load(sprintf('%s/exons_test_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
355 # genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
356 # 'TestEsts', 'TestExon', 'Test', 'TestAcc', 'TestDon') ;
357 # end
358 #
359 # Sequences = Test ; % dna sequences
360 # Acceptors = TestAcc ; % acceptor scores
361 # Donors = TestDon ; % donor scores
362 # Exons = TestExon ; % exon boundaries
363 # Ests = TestEsts ; % est sequences
364
365 # Lower all indices by one to convert matlab
366 # to python indices
367
368 Exons -= 1
369
370 return Sequences, Acceptors, Donors, Exons, Ests, Noises