+ fixed a bug in the filtering (forgot to disable remove_ambiguities)
[qpalma.git] / scripts / compile_dataset.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import os
6 import pdb
7 import io_pickle
8
9 import numpy
10 from numpy.matlib import mat,zeros,ones,inf
11
12 import qpalma
13 import qpalma.tools
14 from qpalma.tools.parseGff import parse_gff
15
16 from qpalma.parsers import FilteredReadParser, RemappedReadParser
17
18 from Genefinding import *
19
20 from genome_utils import load_genomic
21
22 import qpalma.Configuration as Conf
23
24 help = """
25
26 Usage of this script is:
27
28 ./compile_dataset.py gff_file dna_flat_files solexa_reads remapped_reads dataset_file
29
30 """
31
32 info = """
33 The purpose of this script is to compile a full data set for QPalma out of the
34 following files:
35
36 - The information of the gff (gene annotation).
37
38 - The flat dna files with the sequence of the respective gene positions
39
40 - The solexa reads in the filtered version
41
42 - The solexa reads in the remapped version
43
44 This represents a dataset for use with QPalma
45 The entries are def'd as follows:
46
47 - 'id' a unique id for the read
48 - 'chr' the chromosome identifier
49 - 'strand' the identifier for the strand either '+' or '-'
50
51 - 'read_start'
52 - 'exon_stop'
53 - 'exon_start'
54 - 'read_stop'
55
56 - 'remappped_pos' - the position of the remapped read
57 """
58
59 def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file,test):
60
61 assert os.path.exists(filtered_reads)
62 assert os.path.exists(remapped_reads)
63
64 assert not os.path.exists(dataset_file), 'The data_file already exists!'
65
66 joinp = os.path.join
67
68 # first read the gff file(s) and create pickled gene information
69 allGenes = [None]*6
70
71 for i in range(1,6):
72 allGenes[i]= parse_gff(gff_file%i) #,joinp(tmp_dir,'gff_info.pickle'))
73
74 print 'parsing filtered reads..'
75 frp = FilteredReadParser(filtered_reads)
76 all_filtered_reads = frp.parse()
77 print 'found %d filtered reads' % len(all_filtered_reads)
78
79 print 'parsing remapped reads'
80 rrp = RemappedReadParser(remapped_reads)
81 all_remapped_reads = rrp.parse()
82 if all_remapped_reads != None:
83 size_rem = len(all_remapped_reads)
84 else:
85 size_rem = 0
86
87 print 'found %d remapped reads' % size_rem
88
89 # this stores the new dataset
90 Sequences = []
91 Acceptors = []
92 Donors = []
93 Exons = []
94 Ests = []
95 OriginalEsts = []
96 Qualities = []
97 SplitPositions = []
98 Ids = []
99
100 #pdb.set_trace()
101
102 # Iterate over all remapped reads in order to generate for each read an
103 # training / prediction example
104 instance_counter = 0
105
106 for id,currentFRead in all_filtered_reads.items():
107
108 try:
109 currentRReads = all_remapped_reads[id]
110 except:
111 currentRReads = None
112
113 try:
114 gene_id = currentFRead['gene_id']
115 chro = currentFRead['chr']
116 currentGene = allGenes[chro][gene_id]
117 except:
118 pdb.set_trace()
119
120 if currentFRead['strand'] != '+':
121 #print 'wrong strand'
122 continue
123
124 seq, acc, don, exons, est, original_est, qual, spos = process_read(currentRReads,currentFRead,currentGene,dna_flat_files,test)
125
126 if seq == '':
127 continue
128
129 Sequences.append(seq)
130 Acceptors.append(acc)
131 Donors.append(don)
132 Exons.append(exons)
133 Ests.append(est)
134 OriginalEsts.append(original_est)
135 Qualities.append(qual)
136 SplitPositions.append(spos)
137 Ids.append(id)
138
139 instance_counter += 1
140
141 if instance_counter % 100 == 0:
142 print 'processed %d examples' % instance_counter
143
144 if instance_counter == 2000:
145 break
146
147 print 'Full dataset has size %d' % len(Sequences)
148
149 dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors,\
150 'Exons':Exons, 'Ests':Ests, 'OriginalEsts':OriginalEsts, 'Qualities':Qualities,\
151 'SplitPositions':SplitPositions,'Ids':Ids}
152
153 # saving new dataset
154 io_pickle.save(dataset_file,dataset)
155
156
157 def process_read(reReads,fRead,currentGene,dna_flat_files,test):
158 """
159 The purpose of this function is to create an example for QPalma
160 by using a
161
162 """
163 nil = ('','','','','','','','')
164 extended_alphabet = ['-','a','c','g','t','n','[',']']
165 alphabet = ['-','a','c','g','t','n']
166
167 chr = fRead['chr']
168 strand = fRead['strand']
169 quality = fRead['prb']
170 spos = fRead['splitpos']
171 currentReadSeq = fRead['seq']
172 currentReadSeq = currentReadSeq.lower()
173
174 if currentReadSeq.find('[') == -1:
175 return nil
176
177 originalReadSeq = fRead['seq']
178
179 chrom = 'chr%d' % chr
180 sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s' % (chr,strand)
181
182 # use matlab-style functions to access dna sequence
183 currentDNASeq = load_genomic(chrom,strand,currentGene.start,currentGene.stop,dna_flat_files,one_based=False)
184 currentDNASeq = currentDNASeq.lower()
185
186 assert currentDNASeq != '', 'load_genomic returned empty sequence!'
187
188 seq_has_indels = False
189 for elem in currentReadSeq:
190 assert elem in extended_alphabet, 'Invalid char (extended_alphabet) in read: \"%s\"' % elem
191 if elem == ']':
192 seq_has_indels = True
193
194 for elem in currentDNASeq:
195 assert elem in alphabet, 'Invalid char in dna: \"%s\"' % elem
196
197 if strand == '-':
198 try:
199 currentDNASeq = reverse_complement(currentDNASeq)
200 except:
201 print 'invalid char'
202 return nil
203
204 currentDNASeq = 'a' + currentDNASeq
205
206 exon_stop = fRead['exon_stop']
207 exon_start = fRead['exon_start']
208
209 # assertions to check whether exon start/stop positions are next to donor
210 # resp. acceptor sites
211 exonEnd = exon_stop - currentGene.start
212 exonBegin = exon_start - currentGene.start
213
214 current_don = currentDNASeq[exonEnd+1:exonEnd+3]
215 current_acc = currentDNASeq[exonBegin-2:exonBegin]
216 if not current_don == 'gt' or current_don == 'gc':
217 return nil
218
219 if not current_acc == 'ag':
220 return nil
221
222 #assert current_don == 'gt' or current_don == 'gc', pdb.set_trace()
223 #assert current_acc == 'ag', pdb.set_trace()
224
225 # assertions for don/acc sites held now we recalculate offset
226 exon_stop = exon_stop - currentGene.start
227 exon_start = exon_start - currentGene.start
228
229 if not seq_has_indels:
230 p_start = fRead['p_start']
231 exon_stop = fRead['exon_stop']
232 exon_start = fRead['exon_start']
233 p_stop = fRead['p_stop']
234
235 assert (exon_stop - p_start) + (p_stop - exon_start) + 2 == Conf.read_size, 'Invalid read size'
236
237 else:
238 # if we have indels we have to take care of the exons boundaries
239 firstReadSeq = currentReadSeq[:spos]
240 secondReadSeq = currentReadSeq[spos:]
241
242 est_fragment = []
243 r_idx = 0
244 while True:
245 if not r_idx < len(currentReadSeq):
246 break
247
248 if currentReadSeq[r_idx] == '[':
249 est_fragment.append(currentReadSeq[r_idx+2])
250 r_idx += 4
251 continue
252
253 est_fragment.append(currentReadSeq[r_idx])
254 r_idx += 1
255
256 dna_fragment_1 = []
257 r_idx = 0
258 while True:
259 if not r_idx < len(firstReadSeq):
260 break
261
262 if firstReadSeq[r_idx] == '[':
263 dna_fragment_1.append(firstReadSeq[r_idx+1])
264 r_idx += 4
265 continue
266
267 dna_fragment_1.append(firstReadSeq[r_idx])
268 r_idx += 1
269
270 dna_fragment_2 = []
271 r_idx = 0
272 while True:
273 if not r_idx < len(secondReadSeq):
274 break
275
276 if secondReadSeq[r_idx] == '[':
277 dna_fragment_2.append(secondReadSeq[r_idx+1])
278 r_idx += 4
279 continue
280
281 dna_fragment_2.append(secondReadSeq[r_idx])
282 r_idx += 1
283
284 currentReadSeq = "".join(est_fragment)
285 dna_fragment_1 = "".join(dna_fragment_1)
286 dna_fragment_2 = "".join(dna_fragment_2)
287
288 dna_fragment_1_size = len([p for p in dna_fragment_1 if p != '-'])
289 dna_fragment_2_size = len([p for p in dna_fragment_2 if p != '-'])
290
291 # new check the fragment of the annotation
292 new_exon1_size = dna_fragment_1_size
293 p_start = exon_stop - new_exon1_size + 1
294
295 dna_annot_1 = currentDNASeq[p_start:exon_stop+1]
296
297 # new check the fragment of the annotation
298 new_exon2_size = dna_fragment_2_size
299 p_stop = exon_start + new_exon2_size - 1
300
301 dna_annot_2 = currentDNASeq[exon_start:p_stop+1]
302
303 #assert dna_fragment_1.replace('-','') == dna_annot_1, pdb.set_trace()
304 #assert dna_fragment_2.replace('-','') == dna_annot_2, pdb.set_trace()
305
306 if not dna_fragment_1.replace('-','') == dna_annot_1:
307 print 'genomic seq mismatch'
308 return nil
309
310 if not dna_fragment_2.replace('-','') == dna_annot_2:
311 print 'genomic seq mismatch'
312 print originalReadSeq
313 print firstReadSeq
314 print secondReadSeq
315 print dna_annot_1 + dna_annot_2
316 return nil
317
318 #print 'successful'
319
320 for elem in currentReadSeq:
321 assert elem in alphabet, 'Invalid char (alphabet) in read: \"%s\"' % elem
322
323 assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
324 assert p_stop - p_start >= Conf.read_size
325
326 currentExons = zeros((2,2),dtype=numpy.int)
327
328 currentExons[0,0] = p_start
329 currentExons[0,1] = exon_stop
330
331 currentExons[1,0] = exon_start
332 currentExons[1,1] = p_stop
333
334 # determine cutting positions for sequences
335 up_cut = int(currentExons[0,0])
336 down_cut = int(currentExons[1,1])
337
338 # if we perform testing we cut a much wider region as we want to detect
339 # how good we perform on a region
340 import random
341 ADD_POS = random.randint(Conf.extension[0],Conf.extension[1])
342
343 if test:
344 up_cut = up_cut-ADD_POS
345 if up_cut < 0:
346 up_cut = 0
347
348 down_cut = down_cut+ADD_POS
349
350 if down_cut > len(currentDNASeq):
351 down_cut = len(currentDNASeq)
352
353 # cut out part of the sequence
354 currentDNASeq = currentDNASeq[up_cut:down_cut]
355
356 # ensure that index is correct
357 currentExons[0,1] += 1
358 currentExons[1,1] += 1
359
360 if test:
361 currentExons -= up_cut
362 else:
363 currentExons -= currentExons[0,0]
364
365 try:
366 if not (currentDNASeq[int(currentExons[0,1])] == 'g' and\
367 currentDNASeq[int(currentExons[0,1])+1] in ['c','t' ]):
368 print 'not aligned'
369 return nil
370
371 if not (currentDNASeq[int(currentExons[1,0])-1] == 'g' and currentDNASeq[int(currentExons[1,0])-2] == 'a'):
372 print 'not aligned'
373 return nil
374
375 except:
376 pdb.set_trace()
377
378 # now we want to use interval_query to get the predicted splice scores
379 # trained on the TAIR sequence and annotation
380 interval_matrix = createIntArrayFromList([currentGene.start+up_cut-100,currentGene.start+down_cut+100])
381 num_fields = 1
382 num_intervals = 1
383 sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_%d%s'\
384 % (chr,strand)
385
386 ag_tuple_pos = [p for p,e in enumerate(currentDNASeq) if p>0 and p<len(currentDNASeq)-1 and currentDNASeq[p-1]=='a' and e=='g' ]
387
388 # fetch acceptor scores
389 gf = Genef()
390 num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
391 assert num_hits <= len(currentDNASeq)
392
393 #print 'Acceptor hits: %d' % num_hits
394 pos = createIntArrayFromList([0]*num_hits)
395 indices = createIntArrayFromList([0]*num_hits)
396 scores = createDoubleArrayFromList([0]*num_hits*num_fields)
397 gf.getResults(pos,indices,scores)
398
399 acc = [-inf]*(len(currentDNASeq))
400
401 for i in range(num_hits):
402 position = pos[i]
403 position -= (currentGene.start+up_cut)
404 if position < 2 or position > len(currentDNASeq)-1:
405 continue
406 assert 0 <= position < len(currentDNASeq), 'position index wrong'
407 acc[position] = scores[i]
408
409 acc = acc[1:] + [-inf]
410
411 del gf
412
413 # fetch donor scores
414 sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s'\
415 % (chr,strand)
416
417 gf = Genef()
418 num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
419 assert num_hits <= len(currentDNASeq)
420
421 gt_tuple_pos = [p for p,e in enumerate(currentDNASeq) if p>0 and p<len(currentDNASeq)-1 and e=='g' and (currentDNASeq[p+1]=='t' or currentDNASeq[p+1]=='c')]
422
423 #print 'Donor hits: %d' % num_hits
424 pos = createIntArrayFromList([0]*num_hits)
425 indices = createIntArrayFromList([0]*num_hits)
426 scores = createDoubleArrayFromList([0]*num_hits*num_fields)
427 gf.getResults(pos,indices,scores)
428
429 don = [-inf]*(len(currentDNASeq))
430
431 try:
432 for i in range(num_hits):
433 position = pos[i]
434 position -= (currentGene.start+up_cut)
435 if position < 1 or position >= len(currentDNASeq)-1:
436 continue
437 don[position-1] = scores[i]
438 except:
439 pdb.set_trace()
440
441 don = [-inf] + don[:-1]
442
443 assert ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf], pdb.set_trace()
444 assert gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf], pdb.set_trace()
445
446 assert (len(currentDNASeq)) == len(acc), pdb.set_trace()
447 assert (len(currentDNASeq)) == len(don), pdb.set_trace()
448
449 acc = [-inf] + acc[:-1]
450
451
452
453 return currentDNASeq, acc, don, currentExons, currentReadSeq, originalReadSeq, quality, spos
454
455
456 def reverse_complement(seq):
457 map = {'a':'t','c':'g','g':'c','t':'a'}
458
459 new_seq = [map[elem] for elem in seq]
460 new_seq.reverse()
461 new_seq = "".join(new_seq)
462
463 return new_seq
464
465 if __name__ == '__main__':
466 if len(sys.argv) == 1:
467 print info
468
469 assert len(sys.argv) == 6, help
470 compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,dataset_file)