+ changed filter behavior (tendency towards smaller read fragments)
[qpalma.git] / scripts / compile_dataset.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import os
6 import pdb
7 import io_pickle
8
9 import numpy
10 from numpy.matlib import mat,zeros,ones,inf
11
12 import qpalma
13 import qpalma.tools
14 from qpalma.tools.parseGff import parse_gff
15
16 from qpalma.parsers import FilteredReadParser, RemappedReadParser
17
18 from Genefinding import *
19
20 from genome_utils import load_genomic
21
22 import qpalma.Configuration as Conf
23
24 warning = """
25 #########################################################
26 WARNING ! Disabled the use of non-bracket reads WARNING !
27 #########################################################
28
29 """
30
31
32 help = """
33
34 Usage of this script is:
35
36 ./compile_dataset.py gff_file dna_flat_files solexa_reads remapped_reads dataset_file
37
38 """
39
40 info = """
41 The purpose of this script is to compile a full data set for QPalma out of the
42 following files:
43
44 - The information of the gff (gene annotation).
45
46 - The flat dna files with the sequence of the respective gene positions
47
48 - The solexa reads in the filtered version
49
50 - The solexa reads in the remapped version
51
52 This represents a dataset for use with QPalma
53 The entries are def'd as follows:
54
55 - 'id' a unique id for the read
56 - 'chr' the chromosome identifier
57 - 'strand' the identifier for the strand either '+' or '-'
58
59 - 'read_start'
60 - 'exon_stop'
61 - 'exon_start'
62 - 'read_stop'
63
64 - 'remappped_pos' - the position of the remapped read
65 """
66
67 def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file,test):
68
69 assert os.path.exists(filtered_reads)
70 assert os.path.exists(remapped_reads)
71
72 assert not os.path.exists(dataset_file), 'The data_file already exists!'
73
74 joinp = os.path.join
75
76 # first read the gff file(s) and create pickled gene information
77 allGenes = [None]*6
78
79 for i in range(1,6):
80 allGenes[i]= parse_gff(gff_file%i) #,joinp(tmp_dir,'gff_info.pickle'))
81
82 print 'parsing filtered reads..'
83 frp = FilteredReadParser(filtered_reads)
84 all_filtered_reads = frp.parse()
85 print 'found %d filtered reads' % len(all_filtered_reads)
86
87 print 'parsing remapped reads'
88 rrp = RemappedReadParser(remapped_reads)
89 all_remapped_reads = rrp.parse()
90 if all_remapped_reads != None:
91 size_rem = len(all_remapped_reads)
92 else:
93 size_rem = 0
94
95 print 'found %d remapped reads' % size_rem
96
97 # this stores the new dataset
98 Sequences = []
99 Acceptors = []
100 Donors = []
101 Exons = []
102 Ests = []
103 OriginalEsts = []
104 Qualities = []
105 SplitPositions = []
106 Ids = []
107
108 #pdb.set_trace()
109
110 # Iterate over all remapped reads in order to generate for each read an
111 # training / prediction example
112 instance_counter = 0
113
114 for id,currentFRead in all_filtered_reads.items():
115
116 try:
117 currentRReads = all_remapped_reads[id]
118 except:
119 currentRReads = None
120
121 try:
122 gene_id = currentFRead['gene_id']
123 chro = currentFRead['chr']
124 currentGene = allGenes[chro][gene_id]
125 except:
126 pdb.set_trace()
127
128 if currentFRead['strand'] != '+':
129 #print 'wrong strand'
130 continue
131
132 seq, acc, don, exons, est, original_est, qual, spos = process_read(currentRReads,currentFRead,currentGene,dna_flat_files,test)
133
134 if seq == '':
135 continue
136
137 Sequences.append(seq)
138 Acceptors.append(acc)
139 Donors.append(don)
140 Exons.append(exons)
141 Ests.append(est)
142 OriginalEsts.append(original_est)
143 Qualities.append(qual)
144 SplitPositions.append(spos)
145 Ids.append(id)
146
147 instance_counter += 1
148
149 if instance_counter % 100 == 0:
150 print 'processed %d examples' % instance_counter
151
152 if instance_counter == 5000:
153 break
154
155 print 'Full dataset has size %d' % len(Sequences)
156
157 dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors,\
158 'Exons':Exons, 'Ests':Ests, 'OriginalEsts':OriginalEsts, 'Qualities':Qualities,\
159 'SplitPositions':SplitPositions,'Ids':Ids}
160
161 # saving new dataset
162 io_pickle.save(dataset_file,dataset)
163
164
165 def process_read(reReads,fRead,currentGene,dna_flat_files,test):
166 """
167 The purpose of this function is to create an example for QPalma
168 by using a
169
170 """
171 nil = ('','','','','','','','')
172 extended_alphabet = ['-','a','c','g','t','n','[',']']
173 alphabet = ['-','a','c','g','t','n']
174
175 chr = fRead['chr']
176 strand = fRead['strand']
177 #quality = fRead['prb']
178 quality = fRead['cal_prb']
179 #quality = fRead['chastity']
180 spos = fRead['splitpos']
181 currentReadSeq = fRead['seq']
182 currentReadSeq = currentReadSeq.lower()
183
184 #if currentReadSeq.find('[') == -1:
185 # print warning
186 # return nil
187
188 originalReadSeq = fRead['seq']
189
190 chrom = 'chr%d' % chr
191 sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s' % (chr,strand)
192
193 # use matlab-style functions to access dna sequence
194 currentDNASeq = load_genomic(chrom,strand,currentGene.start,currentGene.stop,dna_flat_files,one_based=False)
195 currentDNASeq = currentDNASeq.lower()
196
197 assert currentDNASeq != '', 'load_genomic returned empty sequence!'
198
199 seq_has_indels = False
200 for elem in currentReadSeq:
201 #assert elem in extended_alphabet, 'Invalid char (extended_alphabet) in read: \"%s\"' % elem
202 if not elem in extended_alphabet:
203 return nil
204
205 if elem == ']':
206 seq_has_indels = True
207
208 for elem in currentDNASeq:
209 #assert elem in alphabet, 'Invalid char in dna: \"%s\"' % elem
210 if not elem in alphabet:
211 return nil
212
213 if strand == '-':
214 try:
215 currentDNASeq = reverse_complement(currentDNASeq)
216 except:
217 print 'invalid char'
218 return nil
219
220 currentDNASeq = 'a' + currentDNASeq
221
222 exon_stop = fRead['exon_stop']
223 exon_start = fRead['exon_start']
224
225 # assertions to check whether exon start/stop positions are next to donor
226 # resp. acceptor sites
227 exonEnd = exon_stop - currentGene.start
228 exonBegin = exon_start - currentGene.start
229
230 current_don = currentDNASeq[exonEnd+1:exonEnd+3]
231 current_acc = currentDNASeq[exonBegin-2:exonBegin]
232 if not current_don == 'gt' or current_don == 'gc':
233 return nil
234
235 if not current_acc == 'ag':
236 return nil
237
238 #assert current_don == 'gt' or current_don == 'gc', pdb.set_trace()
239 #assert current_acc == 'ag', pdb.set_trace()
240
241 # assertions for don/acc sites held now we recalculate offset
242 exon_stop = exon_stop - currentGene.start
243 exon_start = exon_start - currentGene.start
244
245 if not seq_has_indels:
246 p_start = fRead['p_start']
247 exon_stop = fRead['exon_stop']
248 exon_start = fRead['exon_start']
249 p_stop = fRead['p_stop']
250
251 assert (exon_stop - p_start) + (p_stop - exon_start) + 2 == Conf.read_size, 'Invalid read size'
252
253 else:
254 # if we have indels we have to take care of the exons boundaries
255 firstReadSeq = currentReadSeq[:spos]
256 secondReadSeq = currentReadSeq[spos:]
257
258 est_fragment = []
259 r_idx = 0
260 while True:
261 if not r_idx < len(currentReadSeq):
262 break
263
264 if currentReadSeq[r_idx] == '[':
265 est_fragment.append(currentReadSeq[r_idx+2])
266 r_idx += 4
267 continue
268
269 est_fragment.append(currentReadSeq[r_idx])
270 r_idx += 1
271
272 dna_fragment_1 = []
273 r_idx = 0
274 while True:
275 if not r_idx < len(firstReadSeq):
276 break
277
278 if firstReadSeq[r_idx] == '[':
279 dna_fragment_1.append(firstReadSeq[r_idx+1])
280 r_idx += 4
281 continue
282
283 dna_fragment_1.append(firstReadSeq[r_idx])
284 r_idx += 1
285
286 dna_fragment_2 = []
287 r_idx = 0
288 while True:
289 if not r_idx < len(secondReadSeq):
290 break
291
292 if secondReadSeq[r_idx] == '[':
293 dna_fragment_2.append(secondReadSeq[r_idx+1])
294 r_idx += 4
295 continue
296
297 dna_fragment_2.append(secondReadSeq[r_idx])
298 r_idx += 1
299
300 currentReadSeq = "".join(est_fragment)
301 dna_fragment_1 = "".join(dna_fragment_1)
302 dna_fragment_2 = "".join(dna_fragment_2)
303
304 dna_fragment_1_size = len([p for p in dna_fragment_1 if p != '-'])
305 dna_fragment_2_size = len([p for p in dna_fragment_2 if p != '-'])
306
307 # new check the fragment of the annotation
308 new_exon1_size = dna_fragment_1_size
309 p_start = exon_stop - new_exon1_size + 1
310
311 dna_annot_1 = currentDNASeq[p_start:exon_stop+1]
312
313 # new check the fragment of the annotation
314 new_exon2_size = dna_fragment_2_size
315 p_stop = exon_start + new_exon2_size - 1
316
317 dna_annot_2 = currentDNASeq[exon_start:p_stop+1]
318
319 #assert dna_fragment_1.replace('-','') == dna_annot_1, pdb.set_trace()
320 #assert dna_fragment_2.replace('-','') == dna_annot_2, pdb.set_trace()
321
322 if not dna_fragment_1.replace('-','') == dna_annot_1:
323 print 'genomic seq mismatch'
324 print 'orignal read:\t %s ' % originalReadSeq
325 print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
326 print 'dna_fragment_1:\t %s' % dna_fragment_1
327 print 'dna_annot 1+2:\t %s %s' % (dna_annot_1,dna_annot_2)
328 return nil
329
330 if not dna_fragment_2.replace('-','') == dna_annot_2:
331 print 'genomic seq mismatch'
332 print 'orignal read:\t %s ' % originalReadSeq
333 print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
334 print 'dna_fragment_2:\t %s' % dna_fragment_2
335 print 'dna_annot 1+2:\t %s %s' % (dna_annot_1,dna_annot_2)
336 return nil
337
338 #print 'successful'
339
340 for elem in currentReadSeq:
341 assert elem in alphabet, 'Invalid char (alphabet) in read: \"%s\"' % elem
342
343 #assert p_start < exon_stop < exon_start < p_stop, pdb.set_trace()
344 assert p_stop - p_start >= Conf.read_size
345
346 currentExons = zeros((2,2),dtype=numpy.int)
347
348 currentExons[0,0] = p_start
349 currentExons[0,1] = exon_stop
350
351 currentExons[1,0] = exon_start
352 currentExons[1,1] = p_stop
353
354 # determine cutting positions for sequences
355 up_cut = int(currentExons[0,0])
356 down_cut = int(currentExons[1,1])
357
358 # if we perform testing we cut a much wider region as we want to detect
359 # how good we perform on a region
360 import random
361 ADD_POS = random.randint(Conf.extension[0],Conf.extension[1])
362
363 if test:
364 up_cut = up_cut-ADD_POS
365 if up_cut < 0:
366 up_cut = 0
367
368 down_cut = down_cut+ADD_POS
369
370 if down_cut > len(currentDNASeq):
371 down_cut = len(currentDNASeq)
372
373 # cut out part of the sequence
374 currentDNASeq = currentDNASeq[up_cut:down_cut]
375
376 # ensure that index is correct
377 currentExons[0,1] += 1
378 currentExons[1,1] += 1
379
380 if test:
381 currentExons -= up_cut
382 else:
383 currentExons -= currentExons[0,0]
384
385 try:
386 if not (currentDNASeq[int(currentExons[0,1])] == 'g' and\
387 currentDNASeq[int(currentExons[0,1])+1] in ['c','t' ]):
388 print 'not aligned'
389 return nil
390
391 if not (currentDNASeq[int(currentExons[1,0])-1] == 'g' and currentDNASeq[int(currentExons[1,0])-2] == 'a'):
392 print 'not aligned'
393 return nil
394
395 except:
396 return nil
397 #pdb.set_trace()
398
399 # now we want to use interval_query to get the predicted splice scores
400 # trained on the TAIR sequence and annotation
401 interval_matrix = createIntArrayFromList([currentGene.start+up_cut-100,currentGene.start+down_cut+100])
402 num_fields = 1
403 num_intervals = 1
404 sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_%d%s'\
405 % (chr,strand)
406
407 ag_tuple_pos = [p for p,e in enumerate(currentDNASeq) if p>0 and p<len(currentDNASeq)-1 and currentDNASeq[p-1]=='a' and e=='g' ]
408
409 # fetch acceptor scores
410 gf = Genef()
411 num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
412 assert num_hits <= len(currentDNASeq)
413
414 #print 'Acceptor hits: %d' % num_hits
415 pos = createIntArrayFromList([0]*num_hits)
416 indices = createIntArrayFromList([0]*num_hits)
417 scores = createDoubleArrayFromList([0]*num_hits*num_fields)
418 gf.getResults(pos,indices,scores)
419
420 acc = [-inf]*(len(currentDNASeq))
421
422 for i in range(num_hits):
423 position = pos[i]
424 position -= (currentGene.start+up_cut)
425 if position < 2 or position > len(currentDNASeq)-1:
426 continue
427 assert 0 <= position < len(currentDNASeq), 'position index wrong'
428 acc[position] = scores[i]
429
430 acc = acc[1:] + [-inf]
431
432 del gf
433
434 # fetch donor scores
435 sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s'\
436 % (chr,strand)
437
438 gf = Genef()
439 num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
440 assert num_hits <= len(currentDNASeq)
441
442 gt_tuple_pos = [p for p,e in enumerate(currentDNASeq) if p>0 and p<len(currentDNASeq)-1 and e=='g' and (currentDNASeq[p+1]=='t' or currentDNASeq[p+1]=='c')]
443
444 #print 'Donor hits: %d' % num_hits
445 pos = createIntArrayFromList([0]*num_hits)
446 indices = createIntArrayFromList([0]*num_hits)
447 scores = createDoubleArrayFromList([0]*num_hits*num_fields)
448 gf.getResults(pos,indices,scores)
449
450 don = [-inf]*(len(currentDNASeq))
451
452 try:
453 for i in range(num_hits):
454 position = pos[i]
455 position -= (currentGene.start+up_cut)
456 if position < 1 or position >= len(currentDNASeq)-1:
457 continue
458 don[position-1] = scores[i]
459 except:
460 return nil
461 #pdb.set_trace()
462
463 don = [-inf] + don[:-1]
464
465 #assert ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf], pdb.set_trace()
466 #assert gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf], pdb.set_trace()
467 #assert (len(currentDNASeq)) == len(acc), pdb.set_trace()
468 #assert (len(currentDNASeq)) == len(don), pdb.set_trace()
469
470
471 if not ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf]:
472 return nil
473
474 if not gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf]:
475 return nil
476
477 if not (len(currentDNASeq)) == len(acc):
478 return nil
479
480 if not (len(currentDNASeq)) == len(don):
481 return nil
482
483 acc = [-inf] + acc[:-1]
484
485 return currentDNASeq, acc, don, currentExons, currentReadSeq, originalReadSeq, quality, spos
486
487
488 def reverse_complement(seq):
489 map = {'a':'t','c':'g','g':'c','t':'a'}
490
491 new_seq = [map[elem] for elem in seq]
492 new_seq.reverse()
493 new_seq = "".join(new_seq)
494
495 return new_seq
496
497 if __name__ == '__main__':
498 if len(sys.argv) == 1:
499 print info
500
501 assert len(sys.argv) == 6, help
502 compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,dataset_file)