[qpalma.git] / qpalma / DataProc.py
commit: added read structure for new parser
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from numpy.matlib import mat,zeros,ones,inf
import cPickle
import pdb
import Configuration as Conf
import tools
from tools.PyGff import *
import io_pickle
import scipy.io
import sys

from genome_utils import load_genomic

def paths_load_data_solexa(expt,genome_info,PAR,test=False):
    # expt can be 'training','validation' or 'test'
    assert expt in ['training','validation','test']

    dna_filename = Conf.dna_filename
    est_filename = Conf.est_filename

    # leftover debugging breakpoint, disabled so the loader can run unattended
    #pdb.set_trace()

    allGenes = cPickle.load(open(dna_filename))

    Sequences = []
    Acceptors = []
    Donors = []
    Exons = []
    Ests = []
    Qualities = []
    SplitPositions = []

    # Iterate over all reads
    for line in open(est_filename):
        line = line.strip()
        # read structure used by the new parser (not yet consumed by the code below):
        #id,chr,pos,strand,mismatches,align_len,offset,alignment = line.split()
        # the rest of this function still expects the old field layout:
        chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()

        splitpos = int(splitpos)
        length = int(length)

        # quality strings are ASCII-encoded with fixed offsets
        prb = [ord(elem)-50 for elem in prb]
        cal = [ord(elem)-64 for elem in cal]
        chastity = [ord(elem)+10 for elem in chastity]

        assert len(prb) == len(seq)

        currentGene = allGenes[gene_id]
        seq = seq.lower()

        #try:
        #   currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower()
        #except:
        #   continue

        genome_file = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
        chrom = 'chr1'

        # use matlab-style functions to access dna sequence
        dna = load_genomic(chrom,'+',currentGene.start,currentGene.stop,genome_file,one_based=True)
        currentSeq = dna

        if len(currentSeq) < (currentGene.stop - currentGene.start):
            continue

        # compare both sequences
        #assert dna == currentSeq, pdb.set_trace()

        p_start = int(p_start)
        exon_stop = int(exon_stop)
        exon_start = int(exon_start)
        p_stop = int(p_stop)

        assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
        assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
        assert p_stop - p_start >= 36

        currentExons = zeros((2,2))

        currentExons[0,0] = p_start
        currentExons[0,1] = exon_stop

        currentExons[1,0] = exon_start
        currentExons[1,1] = p_stop

        # make indices relative to start pos of the current gene
        currentExons -= currentGene.start

        # determine cutting positions for sequences
        up_cut = int(currentExons[0,0])
        down_cut = int(currentExons[1,1])

        # when testing we cut out a much wider region, because we want to
        # see how well we perform on a whole region
        if test:
            up_cut = up_cut-500
            if up_cut < 0:
                up_cut = 0

            down_cut = down_cut+500

            if down_cut > len(currentSeq):
                down_cut = len(currentSeq)

        # cut out part of the sequence
        currentSeq = currentSeq[up_cut:down_cut]

        # ensure that index is correct
        currentExons[0,1] += 1

        if test:
            currentExons -= up_cut
        else:
            currentExons -= currentExons[0,0]

        # keep only reads with canonical splice-site consensus: the donor
        # dinucleotide right after the first exon has to be 'gt' or 'gc',
        # the acceptor dinucleotide right before the second exon has to be 'ag'
        try:
            if not (currentSeq[int(currentExons[0,1])] == 'g' and\
                    currentSeq[int(currentExons[0,1])+1] in ['c','t' ]):
                continue

            if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
                continue

        except:
            pdb.set_trace()

        Sequences.append(currentSeq)

        # now we want to use interval_query to get the predicted splice scores
        # trained on the TAIR sequence and annotation

        from Genefinding import *

        interval_matrix = createIntArrayFromList([currentGene.start+up_cut,currentGene.start+down_cut])
        num_fields = 1
        num_intervals = 1

        # fetch acceptor scores
        fname = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_1+'
        gf = Genef()
        num_hits = gf.query(fname, ['Conf'], num_fields, interval_matrix, num_intervals)
        assert num_hits <= len(currentSeq)

        #print 'Acceptor hits: %d' % num_hits

        pos = createIntArrayFromList([0]*num_hits)
        indices = createIntArrayFromList([0]*num_hits)
        scores = createDoubleArrayFromList([0]*num_hits*num_fields)
        gf.getResults(pos,indices,scores)

        currentAcceptors = [-inf]*(len(currentSeq)+1)

        for i in range(num_hits):
            position = pos[i]
            position -= (currentGene.start+up_cut)
            assert 0 <= position < len(currentSeq)+1, 'position index wrong'
            currentAcceptors[position] = scores[i]

        currentAcceptors = currentAcceptors[1:] + [-inf]
        Acceptors.append(currentAcceptors)

        del gf

        # fetch donor scores
        fname = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+'
        gf = Genef()
        num_hits = gf.query(fname, ['Conf'], num_fields, interval_matrix, num_intervals)
        assert num_hits <= len(currentSeq)

        #print 'Donor hits: %d' % num_hits
        pos = createIntArrayFromList([0]*num_hits)
        indices = createIntArrayFromList([0]*num_hits)
        scores = createDoubleArrayFromList([0]*num_hits*num_fields)
        gf.getResults(pos,indices,scores)

        currentDonors = [-inf]*(len(currentSeq))

        try:
            for i in range(1,num_hits):
                position = pos[i]
                position -= (currentGene.start+up_cut)
                currentDonors[position-1] = scores[i]
        except:
            pdb.set_trace()

        currentDonors = [-inf] + currentDonors[:-1]
        Donors.append(currentDonors)

        Exons.append(currentExons)
        Ests.append(seq)
        Qualities.append(prb)

        SplitPositions.append(splitpos)

        #if len(Sequences[-1]) == 2755:
        #   Sequences = Sequences[:-1]
        #   Acceptors = Acceptors[:-1]
        #   Donors = Donors[:-1]
        #   Exons = Exons[:-1]
        #   Ests = Ests[:-1]
        #   Qualities = Qualities[:-1]
        #   SplitPositions = SplitPositions[:-1]

    #print 'found %d examples' % len(Sequences)

    return Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions

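
# Illustration only (not part of the original pipeline): a minimal sketch of
# the ASCII-offset decoding that paths_load_data_solexa applies to the three
# quality strings of a read. The helper name decode_read_qualities is
# hypothetical; the offsets (-50, -64, +10) are the ones used in the parser above.
def decode_read_qualities(prb_str, cal_str, chastity_str):
    """Decode the ASCII-encoded prb/cal/chastity strings of one read."""
    prb      = [ord(c)-50 for c in prb_str]        # per-base quality values
    cal      = [ord(c)-64 for c in cal_str]        # calibration values
    chastity = [ord(c)+10 for c in chastity_str]   # chastity values
    return prb, cal, chastity

# example: 'h' has ASCII code 104, so a prb character 'h' decodes to 104-50 = 54,
# i.e. decode_read_qualities('hhh','hhh','hhh')[0] == [54, 54, 54]
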
def paths_load_data_pickle(expt,genome_info,PAR):
    """
    Load the relevant pickle file and return the alignment data
    (Sequences, Acceptors, Donors, Exons, Ests, Noises).
    """

    # function [Sequences, Acceptors, Donors, Exons, Ests, Noises] = paths_load_data(expt,genome_info,PAR)
    # Load the relevant file and return the alignment data

    # expt can be 'training','validation' or 'test'
    assert expt in ['training','validation','test']

    tmp_dir = '/fml/ag-raetsch/home/fabio/tmp'

    Noises = []

    if expt == 'training':
        if PAR.microexon:
            if PAR.LOCAL_ALIGN: # local version
                train_data = '%s/microexon_train_data.pickle' % genome_info.basedir
                data = cPickle.load(open(train_data))

            else: # global version
                pass

        else:
            train_data = '%s/exons_train_local.pickle' % genome_info.basedir
            data = cPickle.load(open(train_data))

    #print 'train_data is %s' % train_data

    Sequences = data['Train']     # dna sequences
    Acceptors = data['TrainAcc']  # acceptor scores
    Donors    = data['TrainDon']  # donor scores
    Exons     = data['TrainExon'] # exon boundaries
    Ests      = data['TrainEsts'] # est sequences

    # Lower all indices by one to convert matlab
    # to python indices
    Exons -= 1

    return Sequences, Acceptors, Donors, Exons, Ests, Noises

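
# Illustration only: a hypothetical helper sketching the dict layout that
# paths_load_data_pickle expects inside microexon_train_data.pickle /
# exons_train_local.pickle. The keys are taken from the loader above; the
# helper itself is not part of the original module.
def _dump_train_pickle_example(filename, Sequences, Acceptors, Donors, Exons, Ests):
    data = {'Train'     : Sequences,   # dna sequences
            'TrainAcc'  : Acceptors,   # acceptor scores
            'TrainDon'  : Donors,      # donor scores
            'TrainExon' : Exons,       # exon boundaries (still matlab-style, 1-based)
            'TrainEsts' : Ests}        # est sequences
    cPickle.dump(data, open(filename,'wb'))
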
def paths_load_data(expt,genome_info,PAR):
    """
    Load the relevant matlab (.mat) file and return the alignment data
    (Sequences, Acceptors, Donors, Exons, Ests, Noises).
    """

    # function [Sequences, Acceptors, Donors, Exons, Ests, Noises] = paths_load_data(expt,genome_info,PAR)
    # Load the relevant file and return the alignment data

    # expt can be 'training','validation' or 'test'
    assert expt in ['training','validation','test']

    tmp_dir = '/fml/ag-raetsch/home/fabio/tmp'

    Noises = []

    if expt == 'training':
        if PAR.microexon:
            if PAR.LOCAL_ALIGN: # local version
                train_data = '%s/microexon_train_data_cut_local.mat' % genome_info.basedir
                train_data = '%s/microexon_train_data.mat' % genome_info.basedir
                #train_data_pickle = '%s/microexon_train_data_cut_local.pickle'% tmp_dir
                #io_pickle.convert_v6(train_data,train_data_pickle)
                #train_data = io_pickle.load(train_data_pickle)
                data = scipy.io.loadmat(train_data)

            else: # global version
                train_data = '%s/microexon_train_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat' %\
                    (genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob)
                train_data = '%s/microexon_train_data.mat' % genome_info.basedir
                #train_data_pickle = '%s/microexon_train_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.pickle' %\
                #    (tmp_dir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob)
                #io_pickle.convert_v6(train_data,train_data_pickle)
                #train_data = io_pickle.load(train_data_pickle)
                data = scipy.io.loadmat(train_data)
                Noises = data['TrainNoise'] # substitution matrix

        else:
            train_data = '%s/exons_train_local.mat' % genome_info.basedir
            #train_data_pickle = '%s/exons_train_local.pickle'% tmp_dir
            #io_pickle.convert_v6(train_data,train_data_pickle)
            #microexon_train_data = io_pickle.load(train_data_pickle)
            data = scipy.io.loadmat(train_data)

    #print 'train_data is %s' % train_data

    Sequences = data['Train']     # dna sequences
    Acceptors = data['TrainAcc']  # acceptor scores
    Donors    = data['TrainDon']  # donor scores
    Exons     = data['TrainExon'] # exon boundaries
    Ests      = data['TrainEsts'] # est sequences

    #elif expt == 'validation':
    #   print('Loading validation data\n') ;
    #   if PAR.microexon,
    #     if PAR.LOCAL_ALIGN
    #       %local version
    #       load(sprintf('%s/microexon_val_data_cut_local_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
    #            genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
    #            'ValEsts', 'ValExon', 'Val', 'ValAcc', 'ValDon') ;
    #     else
    #       %global version
    #       load(sprintf('%s/microexon_val_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
    #            genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
    #            'ValEsts', 'ValExon', 'Val', 'ValAcc', 'ValDon') ;
    #     end
    #   else
    #     load(sprintf('%s/exons_val_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
    #          genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
    #          'ValEsts', 'ValExon', 'Val', 'ValAcc', 'ValDon') ;
    #   end
    #
    #   Sequences = Val ;      % dna sequences
    #   Acceptors = ValAcc ;   % acceptor scores
    #   Donors    = ValDon ;   % donor scores
    #   Exons     = ValExon ;  % exon boundaries
    #   Ests      = ValEsts ;  % est sequences

    #elif expt == 'test':
    #   fprintf('Loading test data\n') ;
    #   if PAR.microexon,
    #     if PAR.LOCAL_ALIGN
    #       %local version
    #       load(sprintf('%s/microexon_test_data_cut_local_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
    #            genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
    #            'TestEsts', 'TestExon', 'Test', 'TestAcc', 'TestDon') ;
    #     else
    #       %global version
    #       load(sprintf('%s/microexon_test_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
    #            genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
    #            'TestEsts', 'TestExon', 'Test','TestAcc', 'TestDon', 'TestNoise') ;
    #       Noises = TestNoise ; % substitution matrix
    #     end
    #   else
    #     load(sprintf('%s/exons_test_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
    #          genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
    #          'TestEsts', 'TestExon', 'Test', 'TestAcc', 'TestDon') ;
    #   end
    #
    #   Sequences = Test ;      % dna sequences
    #   Acceptors = TestAcc ;   % acceptor scores
    #   Donors    = TestDon ;   % donor scores
    #   Exons     = TestExon ;  % exon boundaries
    #   Ests      = TestEsts ;  % est sequences

    # Lower all indices by one to convert matlab
    # to python indices
    Exons -= 1

    return Sequences, Acceptors, Donors, Exons, Ests, Noises
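
# Usage sketch (illustration only): how paths_load_data is typically called.
# The genome_info and PAR objects normally come from the QPalma configuration;
# the placeholder classes below are hypothetical and only provide the
# attributes the loader reads in the non-microexon training branch.
if __name__ == '__main__':
    import os

    class _FakeGenomeInfo:
        basedir = '/fml/ag-raetsch/home/fabio/tmp'   # hypothetical path

    class _FakePAR:
        microexon = False   # only this attribute is read in this branch

    gi  = _FakeGenomeInfo()
    par = _FakePAR()

    # only run if the expected .mat file is actually present
    if os.path.exists('%s/exons_train_local.mat' % gi.basedir):
        Sequences, Acceptors, Donors, Exons, Ests, Noises =\
            paths_load_data('training',gi,par)
        print 'loaded %d training examples' % len(Sequences)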