+ modified dataset generation in order to recalculate exons boundaries
[qpalma.git] / scripts / compile_dataset.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import os
import pdb
import io_pickle

import numpy
from numpy.matlib import mat, zeros, ones, inf

import qpalma
import qpalma.tools
from qpalma.tools.parseGff import parse_gff

from qpalma.parsers import FilteredReadParser, RemappedReadParser

from Genefinding import *

from genome_utils import load_genomic

import qpalma.Configuration as Conf

help = """

Usage of this script is:

    ./compile_dataset.py gff_file dna_flat_files solexa_reads remapped_reads dataset_file

"""

info = """
The purpose of this script is to compile a full dataset for QPalma out of the
following files:

- The information of the gff file (gene annotation).

- The flat dna files with the sequence of the respective gene positions.

- The solexa reads in the filtered version.

- The solexa reads in the remapped version.

This represents a dataset for use with QPalma.
The entries are defined as follows:

- 'id'     a unique id for the read
- 'chr'    the chromosome identifier
- 'strand' the identifier for the strand, either '+' or '-'

- 'read_start'
- 'exon_stop'
- 'exon_start'
- 'read_stop'

- 'remapped_pos' - the position of the remapped read
"""
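
# Rough sketch of what the pickled result looks like once it is loaded again;
# this assumes io_pickle.load mirrors the io_pickle.save call used below:
#
#   dataset = io_pickle.load(dataset_file)
#   dataset['Sequences'][i]   # DNA sequence around the i-th read's exons
#   dataset['Exons'][i]       # 2x2 matrix with the two exon fragments
#   dataset['Ests'][i]        # the (glued) read sequence itself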

def compile_d(gff_file, dna_flat_files, filtered_reads, remapped_reads, tmp_dir, dataset_file, test):

    assert os.path.exists(filtered_reads)
    assert os.path.exists(remapped_reads)

    assert not os.path.exists(dataset_file), 'The data_file already exists!'

    joinp = os.path.join

    # read the gff file of every chromosome (gff_file is a template containing
    # '%d') and collect the gene information
    allGenes = [None]*6

    for i in range(1, 6):
        allGenes[i] = parse_gff(gff_file % i)  #,joinp(tmp_dir,'gff_info.pickle'))
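    # allGenes is indexed directly by the chromosome number (1..5), so index 0
    # stays unused; currentFRead['chr'] is later used as this index.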

    print 'parsing filtered reads..'
    frp = FilteredReadParser(filtered_reads)
    all_filtered_reads = frp.parse()
    print 'found %d filtered reads' % len(all_filtered_reads)

    print 'parsing remapped reads'
    rrp = RemappedReadParser(remapped_reads)
    all_remapped_reads = rrp.parse()
    if all_remapped_reads is not None:
        size_rem = len(all_remapped_reads)
    else:
        size_rem = 0

    print 'found %d remapped reads' % size_rem

    # this stores the new dataset
    Sequences = []
    Acceptors = []
    Donors = []
    Exons = []
    Ests = []
    OriginalEsts = []
    Qualities = []
    SplitPositions = []
    Ids = []

    #pdb.set_trace()

    # Iterate over all filtered reads in order to generate a training /
    # prediction example for each of them
    instance_counter = 0

    for id, currentFRead in all_filtered_reads.items():

        try:
            currentRReads = all_remapped_reads[id]
        except:
            currentRReads = None

        try:
            gene_id = currentFRead['gene_id']
            chro = currentFRead['chr']
            currentGene = allGenes[chro][gene_id]
        except:
            pdb.set_trace()

        if currentFRead['strand'] != '+':
            continue

        seq, acc, don, exons, est, original_est, qual, spos = process_read(currentRReads, currentFRead, currentGene, dna_flat_files, test)

        if seq == '':
            continue

        Sequences.append(seq)
        Acceptors.append(acc)
        Donors.append(don)
        Exons.append(exons)
        Ests.append(est)
        OriginalEsts.append(original_est)
        Qualities.append(qual)
        SplitPositions.append(spos)
        Ids.append(id)

        instance_counter += 1

        if instance_counter % 100 == 0:
            print 'processed %d examples' % instance_counter

        if instance_counter == 1000:
            break

    print 'Full dataset has size %d' % len(Sequences)

    dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors,
               'Exons':Exons, 'Ests':Ests, 'OriginalEsts':OriginalEsts,
               'Qualities':Qualities, 'SplitPositions':SplitPositions,
               'Ids':Ids}

    # saving new dataset
    io_pickle.save(dataset_file, dataset)


def process_read(reReads, fRead, currentGene, dna_flat_files, test):
    """
    The purpose of this function is to create one example for QPalma out of a
    filtered read (fRead), its remapped reads (reReads) and the annotation of
    the gene (currentGene) it originates from.
    """
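    # On success an 8-tuple is returned: (dna sequence, acceptor scores, donor
    # scores, exon matrix, glued read, original read, quality values, split
    # position).  On failure the all-empty tuple 'nil' defined below is
    # returned and the caller skips the read.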
    nil = ('', '', '', '', '', '', '', '')
    extended_alphabet = ['-', 'a', 'c', 'g', 't', 'n', '[', ']']
    alphabet = ['-', 'a', 'c', 'g', 't', 'n']

    chr = fRead['chr']
    strand = fRead['strand']
    quality = fRead['prb']
    spos = fRead['splitpos']
    currentReadSeq = fRead['seq']
    currentReadSeq = currentReadSeq.lower()

    originalReadSeq = fRead['seq']

    chrom = 'chr%d' % chr
    sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s' % (chr, strand)

    # use matlab-style functions to access the dna sequence
    currentDNASeq = load_genomic(chrom, strand, currentGene.start, currentGene.stop, dna_flat_files, one_based=True)
    currentDNASeq = currentDNASeq.lower()

    assert currentDNASeq != '', 'load_genomic returned empty sequence!'

    seq_has_indels = False
    for elem in currentReadSeq:
        assert elem in extended_alphabet, 'Invalid char (extended_alphabet) in read: "%s"' % elem
        if elem == ']':
            seq_has_indels = True

    for elem in currentDNASeq:
        assert elem in alphabet, 'Invalid char in dna: "%s"' % elem

    if strand == '-':
        try:
            currentDNASeq = reverse_complement(currentDNASeq)
        except:
            print 'invalid char'
            return nil

    exon_stop = fRead['exon_stop']
    exon_start = fRead['exon_start']


    if not seq_has_indels:
        p_start = fRead['p_start']
        exon_stop = fRead['exon_stop']
        exon_start = fRead['exon_start']
        p_stop = fRead['p_stop']
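        # Coordinates are inclusive, so the first exon fragment covers
        # (exon_stop - p_start + 1) bases and the second one
        # (p_stop - exon_start + 1) bases; together they have to add up to the
        # full read length.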
        assert (exon_stop - p_start) + (p_stop - exon_start) + 2 == Conf.read_size, 'Invalid read size'

    else:
        # if we have indels we have to adjust the exon boundaries
        #pdb.set_trace()

        import re
        dna_del = re.compile(r'\[-(?P<base>[acgt])\]')
        read_del = re.compile(r'\[[acgt](?P<base>-)\]')
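        # Each pattern matches one bracketed alignment column: dna_del matches
        # '[-x]' and read_del matches '[x-]' (x being one of a,c,g,t).
        # Substituting the named group 'base' below collapses every such
        # column into a single character, so the glued read can be checked
        # against Conf.read_size, while the match counts are used to adjust
        # the exon sizes for insertions and deletions.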

        elem_ctr = 0
        r_idx = 0
        while True:
            if not r_idx < len(currentReadSeq):
                break

            if elem_ctr == spos:
                break

            if currentReadSeq[r_idx] == '[':
                r_idx += 4
                elem_ctr += 1

            elem_ctr += 1
            r_idx += 1

        firstReadSeq = currentReadSeq[:r_idx]
        secondReadSeq = currentReadSeq[r_idx:]

        firstReadSeqPruned = re.sub(dna_del, r'\g<base>', firstReadSeq)
        firstReadSeqPruned = re.sub(read_del, r'\g<base>', firstReadSeqPruned)

        secondReadSeqPruned = re.sub(dna_del, r'\g<base>', secondReadSeq)
        secondReadSeqPruned = re.sub(read_del, r'\g<base>', secondReadSeqPruned)

        numDNADel = len(dna_del.findall(firstReadSeq))
        numReadDel = len(read_del.findall(firstReadSeq))

        new_exon1_size = len(firstReadSeqPruned) + numReadDel - numDNADel
        p_start = exon_stop - new_exon1_size + 1

        numDNADel = len(dna_del.findall(secondReadSeq))
        numReadDel = len(read_del.findall(secondReadSeq))

        new_exon2_size = len(secondReadSeqPruned) + numReadDel - numDNADel
        p_stop = exon_start + new_exon2_size - 1

        currentReadSeq = firstReadSeqPruned + secondReadSeqPruned
        assert len(currentReadSeq) == Conf.read_size, 'Error with glued read'

        #pdb.set_trace()

    for elem in currentReadSeq:
        assert elem in alphabet, 'Invalid char (alphabet) in read: "%s"' % elem

    assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
    assert p_stop - p_start >= Conf.read_size

    currentExons = zeros((2,2), dtype=numpy.int)

    currentExons[0,0] = p_start
    currentExons[0,1] = exon_stop

    currentExons[1,0] = exon_start
    currentExons[1,1] = p_stop

    # make indices relative to the start pos of the current gene
    currentExons -= currentGene.start

    # determine cutting positions for sequences
    up_cut = int(currentExons[0,0])
    down_cut = int(currentExons[1,1])

    # when testing we cut out a much wider region because we want to see how
    # well we perform on a whole region rather than on the exact exon positions
    import random
    ADD_POS = random.randint(250, 1000)

    if test:
        up_cut = up_cut - ADD_POS
        if up_cut < 0:
            up_cut = 0

        down_cut = down_cut + ADD_POS

        if down_cut > len(currentDNASeq):
            down_cut = len(currentDNASeq)

    # cut out the relevant part of the sequence
    currentDNASeq = currentDNASeq[up_cut:down_cut]

    # ensure that the index is correct
    currentExons[0,1] += 1
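    # In test mode the exon coordinates are kept relative to the (widened) cut
    # region; otherwise they are made relative to the start of the first exon
    # fragment.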
    if test:
        currentExons -= up_cut
    else:
        currentExons -= currentExons[0,0]

    try:
        if not (currentDNASeq[int(currentExons[0,1])] == 'g' and
                currentDNASeq[int(currentExons[0,1])+1] in ['c', 't']):
            print 'not aligned'
            return nil

        if not (currentDNASeq[int(currentExons[1,0])-1] == 'g' and currentDNASeq[int(currentExons[1,0])-2] == 'a'):
            print 'not aligned'
            return nil

    except:
        pdb.set_trace()

    # now we want to use interval_query to get the predicted splice scores
    # trained on the TAIR sequence and annotation
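    # The scores live in per-contig files of genome-wide SVM predictions; the
    # Genef object is queried with a genomic interval (padded by 100 nt on
    # each side), reports the number of hits and then fills the preallocated
    # position/score arrays via getResults().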
    interval_matrix = createIntArrayFromList([currentGene.start+up_cut-100, currentGene.start+down_cut+100])
    num_fields = 1
    num_intervals = 1
    sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_%d%s'\
                      % (chr, strand)

    ag_tuple_pos = [p for p, e in enumerate(currentDNASeq) if p > 0 and p < len(currentDNASeq)-1 and currentDNASeq[p-1] == 'a' and e == 'g']
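    # ag_tuple_pos (and gt_tuple_pos below) collect the positions of all 'ag'
    # and 'gt'/'gc' dinucleotides; they are only used for the sanity checks
    # that compare them against the positions with finite splice scores.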

    # fetch acceptor scores
    gf = Genef()
    num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
    assert num_hits <= len(currentDNASeq)

    #print 'Acceptor hits: %d' % num_hits
    pos = createIntArrayFromList([0]*num_hits)
    indices = createIntArrayFromList([0]*num_hits)
    scores = createDoubleArrayFromList([0]*num_hits*num_fields)
    gf.getResults(pos, indices, scores)

    acc = [-inf]*(len(currentDNASeq))

    for i in range(num_hits):
        position = pos[i]
        position -= (currentGene.start + up_cut)
        if position < 2 or position > len(currentDNASeq)-1:
            continue
        assert 0 <= position < len(currentDNASeq), 'position index wrong'
        acc[position] = scores[i]

    acc = acc[1:] + [-inf]

    del gf

    # fetch donor scores
    sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s'\
                      % (chr, strand)

    gf = Genef()
    num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
    assert num_hits <= len(currentDNASeq)

    gt_tuple_pos = [p for p, e in enumerate(currentDNASeq) if p > 0 and p < len(currentDNASeq)-1 and e == 'g' and (currentDNASeq[p+1] == 't' or currentDNASeq[p+1] == 'c')]

    #print 'Donor hits: %d' % num_hits
    pos = createIntArrayFromList([0]*num_hits)
    indices = createIntArrayFromList([0]*num_hits)
    scores = createDoubleArrayFromList([0]*num_hits*num_fields)
    gf.getResults(pos, indices, scores)

    don = [-inf]*(len(currentDNASeq))

    try:
        for i in range(num_hits):
            position = pos[i]
            position -= (currentGene.start + up_cut)
            if position < 1 or position >= len(currentDNASeq)-1:
                continue
            don[position-1] = scores[i]
    except:
        pdb.set_trace()

    don = [-inf] + don[:-1]

    assert ag_tuple_pos == [p for p, e in enumerate(acc) if e != -inf], pdb.set_trace()
    assert gt_tuple_pos == [p for p, e in enumerate(don) if e != -inf], pdb.set_trace()

    assert len(currentDNASeq) == len(acc), pdb.set_trace()
    assert len(currentDNASeq) == len(don), pdb.set_trace()

    acc = [-inf] + acc[:-1]

    currentExons[1,1] += 1

    return currentDNASeq, acc, don, currentExons, currentReadSeq, originalReadSeq, quality, spos


def reverse_complement(seq):
    complement = {'a': 't', 'c': 'g', 'g': 'c', 't': 'a'}

    new_seq = [complement[elem] for elem in seq]
    new_seq.reverse()
    new_seq = "".join(new_seq)

    return new_seq
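# Note: the complement table of reverse_complement() has no entry for 'n', so
# a sequence containing 'n' raises a KeyError; process_read() catches this for
# the '-' strand, prints 'invalid char' and skips the read.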

if __name__ == '__main__':
    if len(sys.argv) == 1:
        print info

    assert len(sys.argv) == 6, help

    gff_file, dna_flat_files, solexa_reads, remapped_reads, dataset_file = sys.argv[1:6]

    # tmp_dir is currently unused inside compile_d and test mode is switched
    # off for plain command line runs
    compile_d(gff_file, dna_flat_files, solexa_reads, remapped_reads, None, dataset_file, False)
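
# Example invocation (the paths are made-up placeholders); note that gff_file
# is treated as a template containing '%d', one gff file per chromosome:
#
#   ./compile_dataset.py annotation/chr%d.gff dna_flat_files/ \
#       reads_filtered.txt reads_remapped.txt dataset.pickle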