+ extended dataset compilation function
[qpalma.git] / scripts / compile_dataset.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import os
6 import pdb
7 import io_pickle
8
9 import numpy
10 from numpy.matlib import mat,zeros,ones,inf
11
12 import qpalma
13 import qpalma.tools
14 from qpalma.tools.parseGff import parse_gff
15
16 from qpalma.parsers import FilteredReadParser, RemappedReadParser
17
18 from Genefinding import *
19
20 from genome_utils import load_genomic
21
22 help = """
23
24 Usage of this script is:
25
26 ./compile_dataset.py gff_file dna_flat_files solexa_reads remapped_reads dataset_file
27
28 """
29
30 info = """
31 The purpose of this script is to compile a full data set for QPalma out of the
32 following files:
33
34 - The information of the gff (gene annotation).
35
36 - The flat dna files with the sequence of the respective gene positions
37
38 - The solexa reads in the filtered version
39
40 - The solexa reads in the remapped version
41
42 This represents a dataset for use with QPalma
43 The entries are def'd as follows:
44
45 - 'id' a unique id for the read
46 - 'chr' the chromosome identifier
47 - 'strand' the identifier for the strand either '+' or '-'
48
49 - 'read_start'
50 - 'exon_stop'
51 - 'exon_start'
52 - 'read_stop'
53
54 - 'remappped_pos' - the position of the remapped read
55 """
56
57 def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file,test):
58
59 assert os.path.exists(filtered_reads)
60 assert os.path.exists(remapped_reads)
61
62 assert not os.path.exists(dataset_file), 'The data_file already exists!'
63
64 joinp = os.path.join
65
66 # first read the gff file(s) and create pickled gene information
67 allGenes = [None]*6
68
69 for i in range(1,6):
70 allGenes[i]= parse_gff(gff_file%i) #,joinp(tmp_dir,'gff_info.pickle'))
71
72 print 'parsing filtered reads..'
73 frp = FilteredReadParser(filtered_reads)
74 all_filtered_reads = frp.parse()
75 print 'found %d filtered reads' % len(all_filtered_reads)
76
77 print 'parsing remapped reads'
78 rrp = RemappedReadParser(remapped_reads)
79 all_remapped_reads = rrp.parse()
80 if all_remapped_reads != None:
81 size_rem = len(all_remapped_reads)
82 else:
83 size_rem = 0
84
85 print 'found %d remapped reads' % size_rem
86
87 # this stores the new dataset
88 Sequences = []
89 Acceptors = []
90 Donors = []
91 Exons = []
92 Ests = []
93 Qualities = []
94 SplitPositions = []
95 Ids = []
96
97 #pdb.set_trace()
98
99 # Iterate over all remapped reads in order to generate for each read an
100 # training / prediction example
101 for id,currentFRead in all_filtered_reads.items():
102
103 try:
104 currentRReads = all_remapped_reads[id]
105 except:
106 currentRReads = None
107
108 gene_id = currentFRead['gene_id']
109 chr = currentFRead['chr']
110 currentGene = allGenes[chr][gene_id]
111
112 if currentFRead['strand'] != '+':
113 continue
114
115 seq, acc, don, exons, est, qual, spos = process_read(currentRReads,currentFRead,currentGene,dna_flat_files,test)
116
117 if seq == '':
118 continue
119
120 Sequences.append(seq)
121 Acceptors.append(acc)
122 Donors.append(don)
123 Exons.append(exons)
124 Ests.append(est)
125 Qualities.append(qual)
126 SplitPositions.append(spos)
127 Ids.append(id)
128
129 dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors,\
130 'Exons':Exons, 'Ests':Ests, 'Qualities':Qualities,\
131 'SplitPositions':SplitPositions,'Ids':Ids}
132
133 # saving new dataset
134 io_pickle.save(dataset_file,dataset)
135
136
137 def process_read(reReads,fRead,currentGene,dna_flat_files,test):
138 """
139 The purpose of this function is to create an example for QPalma
140 by using a
141
142 """
143 nil = ('','','','','','','')
144
145 chr = fRead['chr']
146 strand = fRead['strand']
147
148 chrom = 'chr%d' % chr
149 sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s' % (chr,strand)
150
151 # use matlab-style functions to access dna sequence
152 currentSeq = load_genomic(chrom,strand,currentGene.start,currentGene.stop,dna_flat_files,one_based=True)
153 currentSeq = currentSeq.lower()
154
155 assert currentSeq != '', 'load_genomic returned empty sequence!'
156
157 if strand == '-':
158 try:
159 currentSeq = reverse_complement(currentSeq)
160 except:
161 print 'invalid char'
162 return nil
163
164 p_start = fRead['p_start']
165 exon_stop = fRead['exon_stop']
166 exon_start = fRead['exon_start']
167 p_stop = fRead['p_stop']
168
169 currentReadSeq = fRead['seq']
170 quality = fRead['prb']
171 spos = fRead['splitpos']
172
173 assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
174 assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
175 assert p_stop - p_start >= 36
176
177 currentExons = zeros((2,2),dtype=numpy.int)
178
179 currentExons[0,0] = p_start
180 currentExons[0,1] = exon_stop
181
182 currentExons[1,0] = exon_start
183 currentExons[1,1] = p_stop
184
185 # make indices relative to start pos of the current gene
186 currentExons -= currentGene.start
187
188 # determine cutting positions for sequences
189 up_cut = int(currentExons[0,0])
190 down_cut = int(currentExons[1,1])
191
192 # if we perform testing we cut a much wider region as we want to detect
193 # how good we perform on a region
194 if test:
195 up_cut = up_cut-500
196 if up_cut < 0:
197 up_cut = 0
198
199 down_cut = down_cut+500
200
201 if down_cut > len(currentSeq):
202 down_cut = len(currentSeq)
203
204 # cut out part of the sequence
205 currentSeq = currentSeq[up_cut:down_cut]
206
207 # ensure that index is correct
208 currentExons[0,1] += 1
209
210 if test:
211 currentExons -= up_cut
212 else:
213 currentExons -= currentExons[0,0]
214
215 try:
216 if not (currentSeq[int(currentExons[0,1])] == 'g' and\
217 currentSeq[int(currentExons[0,1])+1] in ['c','t' ]):
218 print 'not aligned'
219 return nil
220
221 if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
222 print 'not aligned'
223 return nil
224
225 except:
226 pdb.set_trace()
227
228 # now we want to use interval_query to get the predicted splice scores
229 # trained on the TAIR sequence and annotation
230 interval_matrix = createIntArrayFromList([currentGene.start+up_cut-100,currentGene.start+down_cut+100])
231 num_fields = 1
232 num_intervals = 1
233 sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_%d%s'\
234 % (chr,strand)
235
236 ag_tuple_pos = [p for p,e in enumerate(currentSeq) if p>0 and p<len(currentSeq)-1 and currentSeq[p-1]=='a' and e=='g' ]
237
238 # fetch acceptor scores
239 gf = Genef()
240 num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
241 assert num_hits <= len(currentSeq)
242
243 #print 'Acceptor hits: %d' % num_hits
244 pos = createIntArrayFromList([0]*num_hits)
245 indices = createIntArrayFromList([0]*num_hits)
246 scores = createDoubleArrayFromList([0]*num_hits*num_fields)
247 gf.getResults(pos,indices,scores)
248
249 acc = [-inf]*(len(currentSeq))
250
251 for i in range(num_hits):
252 position = pos[i]
253 position -= (currentGene.start+up_cut)
254 if position < 2 or position > len(currentSeq)-1:
255 continue
256 assert 0 <= position < len(currentSeq), 'position index wrong'
257 acc[position] = scores[i]
258
259 acc = acc[1:] + [-inf]
260
261 del gf
262
263 # fetch donor scores
264 sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s'\
265 % (chr,strand)
266
267 gf = Genef()
268 num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
269 assert num_hits <= len(currentSeq)
270
271 gt_tuple_pos = [p for p,e in enumerate(currentSeq) if p>0 and p<len(currentSeq)-1 and e=='g' and (currentSeq[p+1]=='t' or currentSeq[p+1]=='c')]
272
273 #print 'Donor hits: %d' % num_hits
274 pos = createIntArrayFromList([0]*num_hits)
275 indices = createIntArrayFromList([0]*num_hits)
276 scores = createDoubleArrayFromList([0]*num_hits*num_fields)
277 gf.getResults(pos,indices,scores)
278
279 don = [-inf]*(len(currentSeq))
280
281 try:
282 for i in range(num_hits):
283 position = pos[i]
284 position -= (currentGene.start+up_cut)
285 if position < 1 or position >= len(currentSeq)-1:
286 continue
287 don[position-1] = scores[i]
288 except:
289 pdb.set_trace()
290
291 don = [-inf] + don[:-1]
292
293 assert ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf], pdb.set_trace()
294 assert gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf], pdb.set_trace()
295
296 assert (len(currentSeq)) == len(acc), pdb.set_trace()
297 assert (len(currentSeq)) == len(don), pdb.set_trace()
298
299 acc = [-inf] + acc[:-1]
300
301 return currentSeq, acc, don, currentExons, currentReadSeq, quality, spos
302
303
304 def reverse_complement(seq):
305 map = {'a':'t','c':'g','g':'c','t':'a'}
306
307 new_seq = [map[elem] for elem in seq]
308 new_seq.reverse()
309 new_seq = "".join(new_seq)
310
311 return new_seq
312
313 if __name__ == '__main__':
314 if len(sys.argv) == 1:
315 print info
316
317 assert len(sys.argv) == 6, help
318 compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,dataset_file)