+ minor changes to filterReads
[qpalma.git] / scripts / compile_dataset.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import os
6 import pdb
7 import io_pickle
8
9 import numpy
10 from numpy.matlib import mat,zeros,ones,inf
11
12 import qpalma
13 import qpalma.tools
14 from qpalma.tools.parseGff import parse_gff
15
16 from qpalma.parsers import FilteredReadParser, RemappedReadParser
17
18 from Genefinding import *
19
20 from genome_utils import load_genomic
21
# Usage string shown (via the assert below in __main__) when the script is
# called with the wrong number of arguments.
# NB: the name shadows the builtin 'help'; kept because the __main__ guard
# references it.
help = """

Usage of this script is:

./compile_dataset.py gff_file dna_flat_files solexa_reads remapped_reads dataset_file

"""

# Module description printed when the script is invoked without arguments.
# BUGFIX: fixed typos in the user-facing text ("remappped_pos", "def'd").
info = """
The purpose of this script is to compile a full data set for QPalma out of the
following files:

- The information of the gff (gene annotation).

- The flat dna files with the sequence of the respective gene positions

- The solexa reads in the filtered version

- The solexa reads in the remapped version

This represents a dataset for use with QPalma
The entries are defined as follows:

- 'id' a unique id for the read
- 'chr' the chromosome identifier
- 'strand' the identifier for the strand either '+' or '-'

- 'read_start'
- 'exon_stop'
- 'exon_start'
- 'read_stop'

- 'remapped_pos' - the position of the remapped read
"""
56
def compile_d(gff_file, dna_flat_files, filtered_reads, remapped_reads, tmp_dir, dataset_file, test):
    """Compile a full QPalma dataset and pickle it to dataset_file.

    gff_file       -- format string with one '%d' slot, expanded for
                      chromosomes 1..5
    dna_flat_files -- directory with the flat dna sequence files
    filtered_reads -- path to the solexa reads in the filtered version
    remapped_reads -- path to the solexa reads in the remapped version
    tmp_dir        -- scratch directory (currently unused)
    dataset_file   -- output path; must not exist yet
    test           -- if true, process_read cuts a wider region per read
    """
    assert os.path.exists(filtered_reads)
    assert os.path.exists(remapped_reads)
    assert not os.path.exists(dataset_file), 'The data_file already exists!'

    # parse the per-chromosome gff files; index 0 stays unused so that
    # allGenes can be indexed directly by chromosome number (1..5)
    allGenes = [None] * 6
    for chromo in range(1, 6):
        allGenes[chromo] = parse_gff(gff_file % chromo)

    print('parsing filtered reads..')
    frp = FilteredReadParser(filtered_reads)
    all_filtered_reads = frp.parse()
    print('found %d filtered reads' % len(all_filtered_reads))

    print('parsing remapped reads')
    rrp = RemappedReadParser(remapped_reads)
    all_remapped_reads = rrp.parse()
    # BUGFIX: compare against None with 'is not None' instead of '!= None'
    size_rem = len(all_remapped_reads) if all_remapped_reads is not None else 0
    print('found %d remapped reads' % size_rem)

    # columns of the new dataset
    Sequences = []
    Acceptors = []
    Donors = []
    Exons = []
    Ests = []
    Qualities = []
    SplitPositions = []
    Ids = []

    # iterate over all filtered reads and generate one training / prediction
    # example per read
    instance_counter = 0

    for read_id, currentFRead in all_filtered_reads.items():
        # BUGFIX: replaced a bare 'except:' around the lookup with an explicit
        # None-safe .get(); a read without remapped counterpart yields None
        if all_remapped_reads is not None:
            currentRReads = all_remapped_reads.get(read_id)
        else:
            currentRReads = None

        # BUGFIX: removed a debugging 'except: pdb.set_trace()' -- a missing
        # gene id now raises a proper KeyError/IndexError with a traceback
        gene_id = currentFRead['gene_id']
        chro = currentFRead['chr']
        currentGene = allGenes[chro][gene_id]

        # only reads on the forward strand are used
        if currentFRead['strand'] != '+':
            continue

        seq, acc, don, exons, est, qual, spos = process_read(currentRReads, currentFRead, currentGene, dna_flat_files, test)

        # process_read signals a rejected read with an empty sequence
        if seq == '':
            continue

        Sequences.append(seq)
        Acceptors.append(acc)
        Donors.append(don)
        Exons.append(exons)
        Ests.append(est)
        Qualities.append(qual)
        SplitPositions.append(spos)
        Ids.append(read_id)

        instance_counter += 1

        if instance_counter % 100 == 0:
            print('processed %d examples' % instance_counter)

        # cap the dataset size at 1000 examples
        if instance_counter == 1000:
            break

    dataset = {'Sequences': Sequences, 'Acceptors': Acceptors, 'Donors': Donors,
               'Exons': Exons, 'Ests': Ests, 'Qualities': Qualities,
               'SplitPositions': SplitPositions, 'Ids': Ids}

    # saving new dataset
    io_pickle.save(dataset_file, dataset)
148
149
def process_read(reReads,fRead,currentGene,dna_flat_files,test):
    """Build one QPalma example from a filtered read.

    reReads        -- remapped reads for this read id (may be None)
    fRead          -- dict for one filtered read; keys read here: 'chr',
                      'strand', 'p_start', 'exon_stop', 'exon_start',
                      'p_stop', 'seq', 'prb', 'splitpos'
    currentGene    -- gene object with .start / .stop genomic coordinates
    dna_flat_files -- directory with flat dna files for load_genomic
    test           -- if true, cut a randomly widened region around the read

    Returns a 7-tuple (sequence, acceptor scores, donor scores, exon matrix,
    read sequence, quality, split position), or a tuple of seven empty
    strings when the read has to be rejected.
    """
    # sentinel returned for rejected reads (caller tests seq == '')
    nil = ('','','','','','','')

    # NOTE(review): 'chr' shadows the builtin of the same name
    chr = fRead['chr']
    strand = fRead['strand']

    chrom = 'chr%d' % chr
    # NOTE(review): dead assignment -- this donor filename is overwritten by
    # the acceptor filename further below before it is ever used
    sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s' % (chr,strand)

    # use matlab-style functions to access dna sequence
    currentSeq = load_genomic(chrom,strand,currentGene.start,currentGene.stop,dna_flat_files,one_based=True)
    currentSeq = currentSeq.lower()

    assert currentSeq != '', 'load_genomic returned empty sequence!'

    if strand == '-':
        try:
            currentSeq = reverse_complement(currentSeq)
        except:
            # reverse_complement raises on characters outside 'acgt'
            print 'invalid char'
            return nil

    # genomic coordinates of the two read halves around the split
    p_start = fRead['p_start']
    exon_stop = fRead['exon_stop']
    exon_start = fRead['exon_start']
    p_stop = fRead['p_stop']

    currentReadSeq = fRead['seq']
    quality = fRead['prb']
    spos = fRead['splitpos']

    # sanity checks: the two exon fragments must be ordered and together span
    # exactly a 36bp read (assumes 36bp solexa reads -- TODO confirm)
    assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
    assert (exon_stop - p_start) + (p_stop - exon_start) + 2 == 36, 'Invalid read size'
    assert p_stop - p_start >= 36

    # 2x2 matrix: row 0 = upstream exon, row 1 = downstream exon
    currentExons = zeros((2,2),dtype=numpy.int)

    currentExons[0,0] = p_start
    currentExons[0,1] = exon_stop

    currentExons[1,0] = exon_start
    currentExons[1,1] = p_stop

    # make indices relative to start pos of the current gene
    currentExons -= currentGene.start

    # determine cutting positions for sequences
    up_cut = int(currentExons[0,0])
    down_cut = int(currentExons[1,1])

    # if we perform testing we cut a much wider region as we want to detect
    # how good we perform on a region
    import random
    ADD_POS = random.randint(1,20)

    if test:
        up_cut = up_cut-ADD_POS
        if up_cut < 0:
            up_cut = 0

        down_cut = down_cut+ADD_POS

        if down_cut > len(currentSeq):
            down_cut = len(currentSeq)

    # cut out part of the sequence
    currentSeq = currentSeq[up_cut:down_cut]

    # ensure that index is correct
    currentExons[0,1] += 1

    # re-base exon coordinates onto the cut sequence
    if test:
        currentExons -= up_cut
    else:
        currentExons -= currentExons[0,0]

    # reject reads whose cut sequence does not show the expected splice-site
    # dinucleotides: 'g[ct]' right after the first exon and 'ag' right before
    # the second exon
    try:
        if not (currentSeq[int(currentExons[0,1])] == 'g' and\
        currentSeq[int(currentExons[0,1])+1] in ['c','t' ]):
            print 'not aligned'
            return nil

        if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
            print 'not aligned'
            return nil

    except:
        # NOTE(review): debugging leftover -- an out-of-range index drops
        # into pdb here instead of failing
        pdb.set_trace()

    # now we want to use interval_query to get the predicted splice scores
    # trained on the TAIR sequence and annotation
    interval_matrix = createIntArrayFromList([currentGene.start+up_cut-100,currentGene.start+down_cut+100])
    num_fields = 1
    num_intervals = 1
    sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_%d%s'\
    % (chr,strand)

    # positions of all 'g' in 'ag' dinucleotides (candidate acceptor sites)
    ag_tuple_pos = [p for p,e in enumerate(currentSeq) if p>0 and p<len(currentSeq)-1 and currentSeq[p-1]=='a' and e=='g' ]

    # fetch acceptor scores
    gf = Genef()
    num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
    assert num_hits <= len(currentSeq)

    #print 'Acceptor hits: %d' % num_hits
    pos = createIntArrayFromList([0]*num_hits)
    indices = createIntArrayFromList([0]*num_hits)
    scores = createDoubleArrayFromList([0]*num_hits*num_fields)
    gf.getResults(pos,indices,scores)

    # acceptor score vector, -inf where no prediction exists
    acc = [-inf]*(len(currentSeq))

    for i in range(num_hits):
        position = pos[i]
        # convert genomic position to an index into the cut sequence
        position -= (currentGene.start+up_cut)
        if position < 2 or position > len(currentSeq)-1:
            continue
        assert 0 <= position < len(currentSeq), 'position index wrong'
        acc[position] = scores[i]

    # shift acceptor scores one position to the left
    acc = acc[1:] + [-inf]

    del gf

    # fetch donor scores
    sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s'\
    % (chr,strand)

    gf = Genef()
    num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
    assert num_hits <= len(currentSeq)

    # positions of all 'g' in 'g[tc]' dinucleotides (candidate donor sites)
    gt_tuple_pos = [p for p,e in enumerate(currentSeq) if p>0 and p<len(currentSeq)-1 and e=='g' and (currentSeq[p+1]=='t' or currentSeq[p+1]=='c')]

    #print 'Donor hits: %d' % num_hits
    pos = createIntArrayFromList([0]*num_hits)
    indices = createIntArrayFromList([0]*num_hits)
    scores = createDoubleArrayFromList([0]*num_hits*num_fields)
    gf.getResults(pos,indices,scores)

    # donor score vector, -inf where no prediction exists
    don = [-inf]*(len(currentSeq))

    try:
        for i in range(num_hits):
            position = pos[i]
            position -= (currentGene.start+up_cut)
            if position < 1 or position >= len(currentSeq)-1:
                continue
            don[position-1] = scores[i]
    except:
        # NOTE(review): debugging leftover -- drops into pdb on any error
        pdb.set_trace()

    # shift donor scores one position to the right
    don = [-inf] + don[:-1]

    # score vectors must line up exactly with the dinucleotide positions;
    # the assert 'message' expression calls pdb.set_trace() on failure
    # (debugging leftover)
    assert ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf], pdb.set_trace()
    assert gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf], pdb.set_trace()

    assert (len(currentSeq)) == len(acc), pdb.set_trace()
    assert (len(currentSeq)) == len(don), pdb.set_trace()

    acc = [-inf] + acc[:-1]

    # make the downstream exon end coordinate exclusive
    currentExons[1,1] += 1

    return currentSeq, acc, don, currentExons, currentReadSeq, quality, spos
320
321
def reverse_complement(seq):
    """Return the reverse complement of a lower-case DNA sequence.

    Raises KeyError for any character outside 'acgt'; callers rely on the
    exception to reject reads containing invalid characters.
    """
    # BUGFIX(idiom): the lookup table was named 'map', shadowing the builtin;
    # build the result in one pass with reversed() + str.join
    complement = {'a': 't', 'c': 'g', 'g': 'c', 't': 'a'}
    return ''.join(complement[base] for base in reversed(seq))
330
if __name__ == '__main__':
    if len(sys.argv) == 1:
        # invoked without arguments: print the module description and quit
        print(info)
        sys.exit(0)

    assert len(sys.argv) == 6, help

    # BUGFIX: the five command-line arguments were never unpacked from
    # sys.argv, and compile_d was called with only five of its seven
    # parameters; supply tmp_dir (scratch dir, unused) and test=False
    gff_file, dna_flat_files, solexa_reads, remapped_reads, dataset_file = sys.argv[1:6]
    compile_d(gff_file, dna_flat_files, solexa_reads, remapped_reads, os.getcwd(), dataset_file, False)