[qpalma.git] / tools / qpalma_dataset_generation / compile_dataset.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import random
import os
import re
import pdb
import cPickle

import numpy
from numpy.matlib import mat,zeros,ones,inf

import qpalma
import qpalma.tools

from qpalma.parsers import *

from Genefinding import *

from genome_utils import load_genomic

import qpalma.Configuration as Conf


class DatasetGenerator:
    """
    Compiles a training set (from filtered, spliced reads) and a testing
    set (from vmatch map files) for QPalma and pickles them to disk.
    """

    def __init__(self,filtered_reads,map_file,map_2nd_file):
        assert os.path.exists(filtered_reads), 'Error: Can not find reads file'
        self.filtered_reads = filtered_reads

        assert os.path.exists(map_file), 'Error: Can not find map file'
        assert os.path.exists(map_2nd_file), 'Error: Can not find map_2nd file'
        self.map_file = map_file
        self.map_2nd_file = map_2nd_file

        # both sets are dicts mapping a read id to its example data
        self.training_set = {}
        self.testing_set = {}

        print 'parsing filtered reads..'
        self.all_filtered_reads = parse_filtered_reads(self.filtered_reads)
        print 'found %d filtered reads' % len(self.all_filtered_reads)

    def saveAs(self,dataset_file):
        assert not os.path.exists(dataset_file), 'The dataset_file already exists!'

        # saving new datasets

        #all_keys = self.training_set.keys()
        #random.shuffle(all_keys)
        #training_keys = all_keys[0:10000]
        #cPickle.dump(self.training_set,open('%s.train.pickle'%dataset_file,'w+'),protocol=2)
        #cPickle.dump(training_keys,open('%s.train_keys.pickle'%dataset_file,'w+'),protocol=2)

        cPickle.dump(self.testing_set,open('%s.test.pickle'%dataset_file,'w+'),protocol=2)

        # the prediction keys are simply all read ids of the testing set
        prediction_keys = self.testing_set.keys()

        cPickle.dump(prediction_keys,open('%s.test_keys.pickle'%dataset_file,'w+'),protocol=2)

    def compile_training_set(self):
        # this dict stores the new dataset
        dataset = {}

        # iterate over all remapped reads and generate a training /
        # prediction example for each of them
        instance_counter = 1
        skipped_ctr = 0

        for id,filteredRead in self.all_filtered_reads.items():
            if instance_counter % 1001 == 0:
                print 'processed %d examples' % instance_counter

            # the training set consists only of spliced reads
            if not id < 1000000300000:
                continue

            if filteredRead['strand'] != '+':
                skipped_ctr += 1
                continue

            if not filteredRead['chr'] in range(1,6):
                skipped_ctr += 1
                continue

            # cut out the genomic region used for training
            CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])
            genomicSeq_start = filteredRead['p_start'] - CUT_OFFSET
            genomicSeq_stop = filteredRead['p_stop'] + CUT_OFFSET

            # information on the full read needed for training
            chromo = filteredRead['chr']
            strand = filteredRead['strand']
            original_est = filteredRead['seq']
            quality = filteredRead['prb']
            cal_prb = filteredRead['cal_prb']
            chastity = filteredRead['chastity']

            # the two exon boundaries of the spliced read
            currentExons = zeros((2,2),dtype=numpy.int)
            currentExons[0,0] = filteredRead['p_start']
            currentExons[0,1] = filteredRead['exon_stop']
            currentExons[1,0] = filteredRead['exon_start']
            currentExons[1,1] = filteredRead['p_stop']

            # add instance to the set
            currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
            currentQualities = (quality,cal_prb,chastity)

            dataset[id] = (currentSeqInfo,currentExons,original_est,currentQualities)

            instance_counter += 1

        print 'Full dataset has size %d' % len(dataset)
        print 'Skipped %d reads' % skipped_ctr

        self.training_set = dataset


    def parse_map_file(self,dataset,map_file):
        # vmatch encodes the strand as 'D' (direct) or 'P' (palindromic);
        # 'D' maps to '+', everything else to '-'
        strand_map = ['-','+']
        instance_counter = 1

        for line in open(map_file):
            if instance_counter % 1001 == 0:
                print 'processed %d examples' % instance_counter

            line = line.strip()
            slist = line.split()
            id = int(slist[0])
            chromo = int(slist[1])
            pos = int(slist[2])
            strand = slist[3]
            strand = strand_map[strand == 'D']

            if strand != '+':
                continue

            if not chromo in range(1,6):
                continue

            genomicSeq_start = pos - 1500
            genomicSeq_stop = pos + 1500

            # fetch the missing information from the original reads
            filteredRead = self.all_filtered_reads[id]
            original_est = filteredRead['seq']
            original_est = original_est.lower()

            prb = filteredRead['prb']
            cal_prb = filteredRead['cal_prb']
            chastity = filteredRead['chastity']

            # add instance to the set
            currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
            currentQualities = (prb,cal_prb,chastity)

            # as one single read can have several vmatch matches we store all
            # these matches under the unique id of the read
            if dataset.has_key(id):
                old_entry = dataset[id]
                old_entry.append((currentSeqInfo,original_est,currentQualities))
                dataset[id] = old_entry
            else:
                dataset[id] = [(currentSeqInfo,original_est,currentQualities)]

            instance_counter += 1

        print 'Total examples processed: %d' % instance_counter

        return dataset


    def compile_testing_set(self):

        dataset = {}

        # usually we have two files to parse:
        # the map file from the second run and a subset of the map file from
        # the first run
        dataset = self.parse_map_file(dataset,self.map_file)
        dataset = self.parse_map_file(dataset,self.map_2nd_file)

        # store the full set
        self.testing_set = dataset

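# The helper below is a minimal sketch, not part of the original pipeline:
# it shows how the files written by DatasetGenerator.saveAs could be loaded
# back. The '<prefix>.test.pickle' / '<prefix>.test_keys.pickle' naming
# mirrors saveAs above; the name load_dataset is our own.

def load_dataset(dataset_file):
    # load the pickled testing set and the list of prediction keys
    testing_set = cPickle.load(open('%s.test.pickle' % dataset_file))
    prediction_keys = cPickle.load(open('%s.test_keys.pickle' % dataset_file))
    return testing_set, prediction_keys
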
def compile_dataset_direct(filtered_reads,dataset_file):

    # vmatch strand encoding: 'D' maps to '+', everything else to '-'
    strand_map = ['-','+']

    SeqInfo = []
    OriginalEsts = []
    Qualities = []

    instance_counter = 0

    for line in open(filtered_reads):
        line = line.strip()
        slist = line.split()
        id = int(slist[0])

        if not id < 1000000300000:
            continue

        if instance_counter % 1000 == 0:
            print 'processed %d examples' % instance_counter

        chr = int(slist[1])
        strand = slist[2]
        strand = strand_map[strand == 'D']

        genomicSeq_start = int(slist[10]) - 1000
        genomicSeq_stop = int(slist[13]) + 1000

        original_est = slist[3]
        original_est = original_est.lower()
        #print original_est

        # decode the per-base quality strings (ASCII offsets)
        prb = [ord(elem)-50 for elem in slist[6]]
        cal_prb = [ord(elem)-64 for elem in slist[7]]
        chastity = [ord(elem)+10 for elem in slist[8]]

        #pdb.set_trace()

        # add instance to the set
        SeqInfo.append((id,chr,strand,genomicSeq_start,genomicSeq_stop))
        OriginalEsts.append(original_est)
        Qualities.append((prb,cal_prb,chastity))

        instance_counter += 1

    dataset = [SeqInfo, OriginalEsts, Qualities]

    # saving the new dataset
    cPickle.dump(dataset,open(dataset_file,'w+'),protocol=2)

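# Small illustration (ours, not from the original code) of the quality
# decoding used above and in parseLine below: each character of the three
# quality strings encodes one per-base score as an ASCII offset. The
# offsets -50, -64 and +10 are taken from this file; the example string
# itself is made up.

def _decode_quality_example():
    prb_string = 'hhhS'                          # hypothetical quality string
    prb = [ord(elem)-50 for elem in prb_string]  # ord('h')-50 == 54
    print 'decoded prb values: %s' % prb         # [54, 54, 54, 33]

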
def getSpliceScores(chr,strand,intervalBegin,intervalEnd):
    """
    Use the interval_query machinery to fetch the predicted splice site
    scores (trained on the TAIR sequence and annotation) for the given
    interval.
    """

    size = intervalEnd-intervalBegin
    assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'

    # fetch acceptor scores
    sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
    acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)

    # fetch donor scores
    sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
    don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)

    return acc, don

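# Sketch (ours) of how the returned score lists are meant to line up with
# the DNA sequence, cf. the assertions in get_seq_and_scores below:
# positions that are no predicted splice site carry -inf, predicted sites
# carry a float score. All values here are invented for illustration.

def _splice_score_layout_example():
    genomicSeq = 'ggtaag'
    currentDon = [-inf, 0.7, -inf, -inf, -inf, -inf]  # donor 'gt' starts at pos 1
    currentAcc = [-inf, -inf, -inf, -inf, -inf, 0.3]  # acceptor 'ag' ends at pos 5
    assert len(genomicSeq) == len(currentDon) == len(currentAcc)

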
def process_map(currentRRead,fRead):
    """
    For each match found by vmatch we determine the DNA fragment on which
    we would like to perform an alignment during prediction.
    """

    fragment_size = 3000

    rId = currentRRead['id']
    pos = currentRRead['pos']
    chr = currentRRead['chr']
    strand = currentRRead['strand']

    CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])

    # vmatch found the correct position
    if fRead['chr'] == chr and fRead['strand'] == strand and fRead['p_start']-36 <= pos <= fRead['p_stop']+36:
        genomicSeq_start = fRead['p_start'] - CUT_OFFSET
        genomicSeq_stop = fRead['p_stop'] + CUT_OFFSET
    else:
        genomicSeq_start = pos - (fragment_size/2)
        genomicSeq_stop = pos + (fragment_size/2)

    return (rId,chr,strand,genomicSeq_start,genomicSeq_stop)

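# Usage sketch for process_map with synthetic input dictionaries; every
# field value below is invented. A vmatch hit within 36bp of the true read
# boundaries keeps the true region (extended by a random offset), any other
# hit gets a fixed window of fragment_size centered on the hit position.

def _process_map_example():
    currentRRead = {'id':1, 'pos':10000, 'chr':1, 'strand':'+'}
    fRead = {'chr':1, 'strand':'+', 'p_start':9990, 'p_stop':10050}
    # the hit lies inside [p_start-36,p_stop+36], so the true region is used
    print process_map(currentRRead,fRead)

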
def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
    """
    This function expects an interval, chromosome and strand information and
    returns the genomic sequence of this interval together with the
    associated splice site scores.
    """

    assert genomicSeq_start < genomicSeq_stop

    chrom = 'chr%d' % chr
    genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
    genomicSeq = genomicSeq.lower()

    # check the obtained dna sequence
    assert genomicSeq != '', 'load_genomic returned empty sequence!'

    # all entries other than a,c,g,t are set to n
    no_base = re.compile('[^acgt]')
    genomicSeq = no_base.sub('n',genomicSeq)

    # query the splice site scores on a slightly larger interval
    intervalBegin = genomicSeq_start-100
    intervalEnd = genomicSeq_stop+100

    currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)

    # trim the score lists so that they line up with genomicSeq
    currentAcc = currentAcc[100:-98]
    currentAcc = currentAcc[1:]
    currentDon = currentDon[100:-100]

    length = len(genomicSeq)
    currentAcc = currentAcc[:length]

    # pad the donor scores with -inf up to the sequence length
    currentDon = currentDon+[-inf]*(length-len(currentDon))

    ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
    gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]

    # sanity check: score positions must coincide with ag / gt,gc dinucleotides
    assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
    assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
    assert len(currentAcc) == len(currentDon)

    return genomicSeq, currentAcc, currentDon


def reverse_complement(seq):
    # 'n' maps to itself so that masked bases survive the complementing
    complement = {'a':'t','c':'g','g':'c','t':'a','n':'n'}

    new_seq = [complement[elem] for elem in seq]
    new_seq.reverse()
    new_seq = "".join(new_seq)

    return new_seq

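# Quick sanity check (ours) for reverse_complement: 'acgt' is its own
# reverse complement, so the first call returns its input unchanged.

def _reverse_complement_example():
    assert reverse_complement('acgt') == 'acgt'
    assert reverse_complement('aacc') == 'ggtt'

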
def get_seq(begin,end,exon_end):
    """
    Fetch a genomic sequence on chromosome 1 (+ strand), extending the
    interval by two positions beyond the given exon boundary.
    """

    dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'

    if exon_end:
        gene_start = begin
        gene_stop = end+2
    else:
        gene_start = begin-2
        gene_stop = end

    chrom = 'chr%d' % 1
    strand = '+'

    genomicSeq = load_genomic(chrom,strand,gene_start,gene_stop,dna_flat_files,one_based=False)
    genomicSeq = genomicSeq.lower()

    return genomicSeq


def parseLine(line):
    """
    We assume that a line has the following entries:

    read_nr,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut

    """
    id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut = line.split()

    splitpos = int(splitpos)
    read_size = int(read_size)

    seq = seq.lower()

    # 'D' denotes the direct, 'P' the reverse (palindromic) strand
    assert strand in ['D','P']

    if strand == 'D':
        strand = '+'

    if strand == 'P':
        strand = '-'

    chr = int(chr)

    # decode the per-base quality strings (ASCII offsets)
    prb = [ord(elem)-50 for elem in prb]
    cal_prb = [ord(elem)-64 for elem in cal_prb]
    chastity = [ord(elem)+10 for elem in chastity]

    p_start = int(p_start)
    exon_stop = int(exon_stop)
    exon_start = int(exon_start)
    p_stop = int(p_stop)
    true_cut = int(true_cut)

    line_d = {'id':id, 'chr':chr, 'strand':strand, 'seq':seq, 'splitpos':splitpos,
              'read_size':read_size, 'prb':prb, 'cal_prb':cal_prb, 'chastity':chastity,
              'gene_id':gene_id, 'p_start':p_start, 'exon_stop':exon_stop,
              'exon_start':exon_start, 'p_stop':p_stop, 'true_cut':true_cut}

    return line_d

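# Usage sketch for parseLine with a synthetic input line; every field value
# below is invented and only meant to satisfy the 15-column format
# documented in the docstring.

def _parse_line_example():
    line = '1000000000001 3 D acgtacgt 4 8 hhhhhhhh hhhhhhhh hhhhhhhh GENE1 100 103 200 203 4'
    read = parseLine(line)
    print read['strand'], read['p_start'], read['true_cut']   # + 100 4

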
if __name__ == '__main__':
    if len(sys.argv) != 5:
        print 'usage: %s filtered_reads map_file map_2nd_file dataset_file' % sys.argv[0]
        sys.exit(1)

    filtered_reads, map_file, map_2nd_file, dataset_file = sys.argv[1:5]

    generator = DatasetGenerator(filtered_reads,map_file,map_2nd_file)
    generator.compile_training_set()
    generator.compile_testing_set()
    generator.saveAs(dataset_file)