+ fixed bug in the index calculation of donor/acceptor scores
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import random
import os
import re
import pdb
import cPickle

import numpy
from numpy.matlib import mat,zeros,ones,inf

import qpalma
import qpalma.tools
from qpalma.tools.parseGff import parse_gff

from qpalma.parsers import FilteredReadParser, RemappedReadParser, MapParser

from Genefinding import *

from genome_utils import load_genomic

import qpalma.Configuration as Conf
warning = """
#########################################################
WARNING ! Disabled the use of non-bracket reads WARNING !
#########################################################

"""

help = """

Usage of this script is:

./compile_dataset.py gff_file dna_flat_files solexa_reads remapped_reads dataset_file

"""

info = """
The purpose of this script is to compile a full dataset for QPalma out of the
following files:

- The information of the gff file (gene annotation).

- The flat dna files with the sequences of the respective gene positions.

- The Solexa reads in the filtered version.

- The Solexa reads in the remapped version.

This represents a dataset for use with QPalma.
The entries are defined as follows:

- 'id' - a unique id for the read
- 'chr' - the chromosome identifier
- 'strand' - the strand identifier, either '+' or '-'

- 'read_start'
- 'exon_stop'
- 'exon_start'
- 'read_stop'

- 'remapped_pos' - the position of the remapped read
"""

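# For reference, the dataset that compile_d below actually pickles is a list of
# five parallel lists (one entry per read); this layout is taken directly from
# the code further down:
#
#   SeqInfo[i]              = (chr, strand, up_cut, down_cut)
#   Exons[i]                = 2x2 matrix of exon borders
#   OriginalEsts[i]         = the read sequence
#   Qualities[i]            = (quality, cal_prb, chastity)
#   AlternativeSequences[i] = [(chr, strand, start, stop, label), ...]
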
# some useful constants

extended_alphabet = ['-','a','c','g','t','n','[',']']
alphabet = ['-','a','c','g','t','n']

# some dummy elements to be returned if current example violates assertions
nil_dna_frag = ('','','','','')
remapped_nil = ('')
process_nil = ('','','','','','','','')

nil_splice_scores = ('','')

def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file,test):
    """
    Iterate over all filtered reads, combine each of them with its remapped
    (vmatch) hits and pickle the resulting dataset to dataset_file.
    """

    assert os.path.exists(filtered_reads)
    assert os.path.exists(remapped_reads)
    assert not os.path.exists(dataset_file), 'The dataset_file already exists!'

    joinp = os.path.join

    # first read the gff file(s) and create pickled gene information
    # (the parsed genes are currently not used further below)
    allGenes = [None]*6

    for i in range(1,6):
        allGenes[i] = parse_gff(gff_file%i) #,joinp(tmp_dir,'gff_info.pickle'))

    print 'parsing filtered reads..'
    frp = FilteredReadParser(filtered_reads)
    all_filtered_reads = frp.parse()

    print 'found %d filtered reads' % len(all_filtered_reads)

    print 'parsing map reads'
    rrp = MapParser(remapped_reads)
    all_remapped_reads = rrp.parse()

    if all_remapped_reads is not None:
        size_rem = len(all_remapped_reads)
    else:
        size_rem = 0

    print 'found %d remapped reads' % size_rem

    # this stores the new dataset
    SeqInfo = []
    Exons = []
    OriginalEsts = []
    Qualities = []
    AlternativeSequences = []

    # Iterate over all filtered reads in order to generate a training /
    # prediction example for each of them
    instance_counter = 0

    for fId,currentFRead in all_filtered_reads.items():

        # the remapped read id is the filtered read id plus a constant offset
        reId = str(1000000000000+int(fId))
        try:
            reReads = all_remapped_reads[reId]
        except (KeyError,TypeError):
            # no remapped reads for this id (or no remapped reads at all)
            continue

        try:
            gene_id = currentFRead['gene_id']
            chro = currentFRead['chr']
        except KeyError:
            continue

        # only use reads on the forward strand
        if currentFRead['strand'] != '+':
            continue

        seq_info, exons =\
            process_filtered_read(currentFRead,dna_flat_files,test)

        original_est = currentFRead['seq']

        alternative_seq = process_map(reReads,currentFRead,dna_flat_files)

        SeqInfo.append(seq_info)
        Exons.append(exons)
        OriginalEsts.append(original_est)

        quality = currentFRead['prb']
        cal_prb = currentFRead['cal_prb']
        chastity = currentFRead['chastity']

        Qualities.append( (quality,cal_prb,chastity) )
        AlternativeSequences.append(alternative_seq)

        instance_counter += 1

        if instance_counter % 1000 == 0:
            print 'processed %d examples' % instance_counter

        # cap the dataset size
        if instance_counter == 20000:
            break

    print 'Full dataset has size %d' % len(SeqInfo)

    dataset = [SeqInfo, Exons, OriginalEsts, Qualities,\
        AlternativeSequences]

    # saving new dataset
    #io_pickle.save(dataset_file,dataset)

    cPickle.dump(dataset,open(dataset_file,'w+'),protocol=2)
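
    # A minimal sketch (not part of the pipeline) of how a dataset file written
    # here can be read back later; the filename is just a placeholder:
    #
    #   SeqInfo, Exons, OriginalEsts, Qualities, AlternativeSequences = \
    #      cPickle.load(open('dataset.pickle'))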


def process_filtered_read(fRead,dna_flat_files,test):
    """
    Creates a training example from a filtered read: the sequence interval
    (extended by a random cut offset) and the two exon borders.
    (The test flag is currently unused.)
    """

    chr = fRead['chr']
    strand = fRead['strand']

    genomic_start = fRead['p_start']
    genomic_stop = fRead['p_stop']

    CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])

    # do not extend upstream beyond the sequence start
    if genomic_start <= CUT_OFFSET:
        up_cut = genomic_start
    else:
        up_cut = genomic_start - CUT_OFFSET

    #if genomic_stop + CUT_OFFSET > :
    #   down_cut = currentGene.stop - currentExons[1,1]
    #else:
    down_cut = genomic_stop + CUT_OFFSET

    seq_info = (chr,strand,up_cut,down_cut)

    # now that we have p_start and p_stop properly set
    # we can define the correct exon borders
    currentExons = zeros((2,2),dtype=numpy.int)

    currentExons[0,0] = fRead['p_start']
    currentExons[0,1] = fRead['exon_stop']

    currentExons[1,0] = fRead['exon_start']
    currentExons[1,1] = fRead['p_stop']

    return seq_info, currentExons

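# The exon matrix returned by process_filtered_read above is laid out as
#
#   currentExons = [[ p_start     exon_stop ],      # first exon of the read
#                   [ exon_start  p_stop    ]]      # second exon of the read
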

def getSpliceScores(chr,strand,intervalBegin,intervalEnd):
    """
    Use interval_query to fetch the predicted splice site scores (trained on
    the TAIR sequence and annotation) for the given interval.
    """

    assert intervalEnd > intervalBegin

    size = intervalEnd-intervalBegin

    interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
    pos_size = new_intp()
    intp_assign(pos_size,1)

    # fetch acceptor scores
    sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s' % (chr,strand)
    result = interval_query(sscore_filename, ['Conf'], 1, interval_matrix, 1,pos_size)

    num_hits = result[0]
    assert num_hits <= size, pdb.set_trace()
    pos = result[1]
    scores = result[3]

    # positions without a predicted acceptor site keep a score of -inf
    acc = [-inf]*(size)

    #print [p - intervalBegin for p in pos]

    for i in range(num_hits):
        position = pos[i] - intervalBegin
        # skip hits that fall outside the usable part of the interval
        if not (position > 1 and position < size):
            continue

        assert 0 <= position <= size, pdb.set_trace()
        acc[position] = scores[i]

    # fetch donor scores
    sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s' % (chr,strand)
    result = interval_query(sscore_filename, ['Conf'], 1, interval_matrix, 1,pos_size)

    num_hits = result[0]
    assert num_hits <= size, pdb.set_trace()
    pos = result[1]
    scores = result[3]

    # positions without a predicted donor site keep a score of -inf
    don = [-inf]*(size)

    #print [p - intervalBegin for p in pos]

    for i in range(num_hits):
        position = pos[i] - intervalBegin
        # skip hits that fall outside the usable part of the interval
        if not (position > 1 and position < size):
            continue

        assert 0 <= position <= size, pdb.set_trace()
        don[position] = scores[i]

    return acc, don

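# A minimal usage sketch for getSpliceScores above (the concrete numbers are
# made up): the returned lists are indexed relative to intervalBegin, so
# acc[p] / don[p] is the score at genomic position intervalBegin + p.
#
#   acc, don = getSpliceScores(1, '+', 10000, 10500)
#   assert len(acc) == len(don) == 500
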

def process_map(reReads,fRead,dna_flat_files):
    """
    For all matches found by Vmatch we determine the fragment of the DNA
    against which we would like to perform an alignment during prediction.
    """

    fragment_size = 3000

    alternativeSeq = []

    for currentRRead in reReads:
        rId = currentRRead['id']
        pos = currentRRead['pos']
        chr = currentRRead['chr']
        strand = currentRRead['strand']
        seq = currentRRead['seq']

        # only use hits on the forward strand
        if strand != '+':
            continue

        length = currentRRead['length']
        offset = currentRRead['offset']

        currentLabel = False

        # vmatch found the correct position
        if fRead['chr'] == chr and fRead['strand'] == strand and fRead['p_start'] <= pos <= fRead['p_stop']:
            currentLabel = True

        # first half of the read matches somewhere
        # search downstream for the rest
        if offset == 0:
            genomicSeq_start = pos
            genomicSeq_stop = pos + fragment_size
            alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
            continue

        # second half of the read matches somewhere
        # search upstream for the rest
        elif offset != 0 and length + offset == Conf.read_size:
            genomicSeq_start = pos - fragment_size
            genomicSeq_stop = pos+length
            alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
            continue

        # middle part of the read matches somewhere
        # search up/downstream for the rest
        else:
            genomicSeq_start = pos - (fragment_size/2)
            genomicSeq_stop = pos + (fragment_size/2)
            alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )

            #genomicSeq_start = pos - fragment_size
            #genomicSeq_stop = pos+36
            #alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
            continue

    return alternativeSeq

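# A small worked example for process_map above (numbers are made up): with
# fragment_size = 3000 and a '+'-strand hit at pos = 10000, the candidate region
# becomes (chr, '+', 10000, 13000) when the first half of the read matched
# (offset == 0), (chr, '+', 7000, 10000 + length) when the second half matched
# (offset + length == Conf.read_size), and (chr, '+', 8500, 11500) otherwise.
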

def check_remapped_reads(reReads,fRead,dna_flat_files):
    """
    Debugging helper: print the positions of a filtered read and of its
    remapped (Vmatch) hits so that they can be compared by eye.
    """

    fragment_size = 3000

    seq = fRead['seq']
    strand = fRead['strand']
    pos = fRead['p_start']
    chr = fRead['chr']

    print 'fRead:\t%d %d %d %d' %\
        (fRead['p_start'],fRead['exon_stop'],fRead['exon_start'],fRead['p_stop'])

    for currentRRead in reReads:
        rId = currentRRead['id']

        pos1 = currentRRead['pos1']
        chr1 = currentRRead['chr1']
        seq1 = currentRRead['seq1']

        seq2 = currentRRead['seq2']
        pos2 = 0

        if not seq2 == '':
            pos2 = currentRRead['pos2']
            chr2 = currentRRead['chr2']
            s2_pos = seq.find(seq2)

        print 'reRead:\t %d %d %d %d'%(pos1,pos1+len(seq1),pos2,pos2+len(seq2))


def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
    """
    This function expects an interval, chromosome and strand information and
    then returns the genomic sequence of this interval together with the
    associated splice site scores.
    """

    chrom = 'chr%d' % chr
    genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
    genomicSeq = genomicSeq.lower()

    # check the obtained dna sequence
    assert genomicSeq != '', 'load_genomic returned empty sequence!'

    # replace any non-acgt character by 'n'
    no_base = re.compile('[^acgt]')
    genomicSeq = no_base.sub('n',genomicSeq)

    # query the splice scores with a 100 nt margin on both sides, trimmed off again below
    intervalBegin = genomicSeq_start-100
    intervalEnd = genomicSeq_stop+100
    seq_pos_offset = genomicSeq_start

    currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)

    currentAcc = currentAcc[100:-100]
    currentDon = currentDon[100:-100]

    # sanity check: acceptor scores may only occur right after an 'ag' and
    # donor scores only at a 'gt'/'gc' dinucleotide
    ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq)-1 and genomicSeq[p-2]=='a' and genomicSeq[p-1]=='g' ]
    gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]

    assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
    assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()

    return genomicSeq, currentAcc, currentDon

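# A minimal usage sketch for get_seq_and_scores above (coordinates are made up):
#
#   seq, acc, don = get_seq_and_scores(1, '+', 10000, 13000, dna_flat_files)
#
# seq is the lower-case genomic sequence of the interval, while acc/don hold the
# acceptor/donor scores for the same region with -inf at non-splice-site positions.
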

def reverse_complement(seq):
    # include 'n' so that masked bases do not raise a KeyError
    complement = {'a':'t','c':'g','g':'c','t':'a','n':'n'}

    new_seq = [complement[elem] for elem in seq]
    new_seq.reverse()
    new_seq = "".join(new_seq)

    return new_seq

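# For example, with the mapping above: reverse_complement('aacgn') == 'ncgtt'
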
def get_seq(begin,end,exon_end):
    """
    Return the lower-case genomic sequence of chromosome 1 ('+' strand) for the
    interval [begin,end], extended by two bases downstream if exon_end is set
    and by two bases upstream otherwise.
    """

    dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'

    if exon_end:
        gene_start = begin
        gene_stop = end+2
    else:
        gene_start = begin-2
        gene_stop = end

    chrom = 'chr%d' % 1
    strand = '+'

    genomicSeq = load_genomic(chrom,strand,gene_start,gene_stop,dna_flat_files,one_based=False)
    genomicSeq = genomicSeq.lower()

    return genomicSeq

if __name__ == '__main__':
    if len(sys.argv) == 1:
        print info
        sys.exit(0)

    assert len(sys.argv) == 6, help
    gff_file, dna_flat_files, solexa_reads, remapped_reads, dataset_file = sys.argv[1:6]
    # tmp_dir and test are currently unused inside compile_d; pass placeholders
    compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,None,dataset_file,False)