+ minor bugfixes in the compile_dataset script
[qpalma.git] / scripts / compile_dataset.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import os
6 import pdb
7 import io_pickle
8
9 import numpy
10 from numpy.matlib import mat,zeros,ones,inf
11
12 import qpalma
13 import qpalma.tools
14 from qpalma.tools.parseGff import parse_gff
15
16 from qpalma.parsers import FilteredReadParser, RemappedReadParser
17
18 from Genefinding import *
19
20 from genome_utils import load_genomic
21
22 help = """
23
24 Usage of this script is:
25
26 ./compile_dataset.py gff_file dna_flat_files solexa_reads remapped_reads dataset_file
27
28 """
29
30 info = """
31 The purpose of this script is to compile a full data set for QPalma out of the
32 following files:
33
34 - The information of the gff (gene annotation).
35
36 - The flat dna files with the sequence of the respective gene positions
37
38 - The solexa reads in the filtered version
39
40 - The solexa reads in the remapped version
41
42 This represents a dataset for use with QPalma
43 The entries are def'd as follows:
44
45 - 'id' a unique id for the read
46 - 'chr' the chromosome identifier
47 - 'strand' the identifier for the strand either '+' or '-'
48
49 - 'read_start'
50 - 'exon_stop'
51 - 'exon_start'
52 - 'read_stop'
53
54 - 'remappped_pos' - the position of the remapped read
55 """
56
57 def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file):
58
59 assert os.path.exists(gff_file)
60 #for file in dna_flat_files:
61 # assert os.path.exists(file)
62
63 assert os.path.exists(filtered_reads)
64 assert os.path.exists(remapped_reads)
65
66 assert not os.path.exists(dataset_file), 'The data_file already exists!'
67
68 joinp = os.path.join
69
70 # first read the gff file(s) and create pickled gene information
71 allGenes = parse_gff(gff_file) #,joinp(tmp_dir,'gff_info.pickle'))
72
73 print 'parsing filtered reads'
74 frp = FilteredReadParser(filtered_reads)
75
76 print 'parsing remapped reads'
77 all_filtered_reads = frp.parse()
78
79 rrp = RemappedReadParser(remapped_reads)
80 all_remapped_reads = rrp.parse()
81
82 # this stores the new dataset
83 Sequences = []
84 Acceptors = []
85 Donors = []
86 Exons = []
87 Ests = []
88 Qualities = []
89 SplitPositions = []
90 Ids = []
91
92 # Iterate over all remapped reads in order to generate for each read an
93 # training / prediction example
94 for id,currentFRead in all_filtered_reads.items():
95
96 try:
97 currentRReads = all_remapped_reads[id]
98 except:
99 currentRReads = None
100
101 gene_id = currentFRead['gene_id']
102 currentGene = allGenes[gene_id]
103
104 #seq, acc, don, exons, est, qual, spos = process_read(currentRReads,currentFRead,currentGene,genome_file,sscore_filename)
105 seq, acc, don, exons, est, qual, spos = process_read(currentRReads,currentFRead,currentGene)
106
107 if seq == '':
108 continue
109
110 Sequences.append(seq)
111 Acceptors.append(acc)
112 Donors.append(don)
113 Exons.append(exons)
114 Ests.append(est)
115 Qualities.append(qual)
116 SplitPositions.append(spos)
117 Ids.append(id)
118
119 dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors,\
120 'Exons':Exons, 'Ests':Ests, 'Qualities':Qualities,\
121 'SplitPositions':SplitPositions,'Ids':Ids}
122
123 # saving new dataset
124 io_pickle.save(dataset_file,dataset)
125
126
127 #def process_read(reReads,fRead,currentGene,genome_file,sscore_filename):
128 def process_read(reReads,fRead,currentGene):
129 """
130 The purpose of this function is to create an example for QPalma
131 by using a
132
133 """
134 chr = fRead['chr']
135 strand = fRead['strand']
136
137 chrom = 'chr%d' % chr
138 genome_file = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
139 sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s' % (chr,strand)
140
141 # use matlab-style functions to access dna sequence
142 currentSeq = load_genomic(chrom,strand,currentGene.start,currentGene.stop,genome_file,one_based=True)
143 currentSeq = currentSeq.lower()
144
145 if strand == '-':
146 currentSeq = reverse_complement(currentSeq)
147
148 nil = ('','','','','','','')
149
150 if len(currentSeq) < (currentGene.stop - currentGene.start):
151 return nil
152
153 p_start = fRead['p_start']
154 exon_stop = fRead['exon_stop']
155 exon_start = fRead['exon_start']
156 p_stop = fRead['p_stop']
157
158 currentReadSeq = fRead['seq']
159 quality = fRead['prb']
160 spos = fRead['splitpos']
161
162 #if fRead['chr'] != 1 and fRead['strand'] != 'D':
163 # return nil
164
165 assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
166 assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
167 assert p_stop - p_start >= 36
168
169 currentExons = zeros((2,2),dtype=numpy.int)
170
171 currentExons[0,0] = p_start
172 currentExons[0,1] = exon_stop
173
174 currentExons[1,0] = exon_start
175 currentExons[1,1] = p_stop
176
177 # make indices relative to start pos of the current gene
178 currentExons -= currentGene.start
179
180 # determine cutting positions for sequences
181 up_cut = int(currentExons[0,0])
182 down_cut = int(currentExons[1,1])
183
184 # if we perform testing we cut a much wider region as we want to detect
185 # how good we perform on a region
186 test = False
187
188 if test:
189 up_cut = up_cut-500
190 if up_cut < 0:
191 up_cut = 0
192
193 down_cut = down_cut+500
194
195 if down_cut > len(currentSeq):
196 down_cut = len(currentSeq)
197
198 # cut out part of the sequence
199 currentSeq = currentSeq[up_cut:down_cut]
200
201 # ensure that index is correct
202 currentExons[0,1] += 1
203
204 if test:
205 currentExons -= up_cut
206 else:
207 currentExons -= currentExons[0,0]
208
209 try:
210 if not (currentSeq[int(currentExons[0,1])] == 'g' and\
211 currentSeq[int(currentExons[0,1])+1] in ['c','t' ]):
212 return nil
213
214 if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
215 return nil
216
217 except:
218 pdb.set_trace()
219
220 # now we want to use interval_query to get the predicted splice scores
221 # trained on the TAIR sequence and annotation
222
223 interval_matrix = createIntArrayFromList([currentGene.start+up_cut-100,currentGene.start+down_cut+100])
224 num_fields = 1
225 num_intervals = 1
226 sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_1+'
227
228 ag_tuple_pos = [p for p,e in enumerate(currentSeq) if p>0 and p<len(currentSeq)-1 and currentSeq[p-1]=='a' and e=='g' ]
229
230 # fetch acceptor scores
231 gf = Genef()
232 num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
233 assert num_hits <= len(currentSeq)
234
235 #print 'Acceptor hits: %d' % num_hits
236 pos = createIntArrayFromList([0]*num_hits)
237 indices = createIntArrayFromList([0]*num_hits)
238 scores = createDoubleArrayFromList([0]*num_hits*num_fields)
239 gf.getResults(pos,indices,scores)
240
241 acc = [-inf]*(len(currentSeq))
242
243 for i in range(num_hits):
244 position = pos[i]
245 position -= (currentGene.start+up_cut)
246 if position < 2 or position > len(currentSeq)-1:
247 continue
248 assert 0 <= position < len(currentSeq), 'position index wrong'
249 acc[position] = scores[i]
250
251 acc = acc[1:] + [-inf]
252
253 del gf
254
255 # fetch donor scores
256 sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+'
257 gf = Genef()
258 num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
259 assert num_hits <= len(currentSeq)
260
261 gt_tuple_pos = [p for p,e in enumerate(currentSeq) if p>0 and p<len(currentSeq)-1 and e=='g' and (currentSeq[p+1]=='t' or currentSeq[p+1]=='c')]
262
263 #print 'Donor hits: %d' % num_hits
264 pos = createIntArrayFromList([0]*num_hits)
265 indices = createIntArrayFromList([0]*num_hits)
266 scores = createDoubleArrayFromList([0]*num_hits*num_fields)
267 gf.getResults(pos,indices,scores)
268
269 don = [-inf]*(len(currentSeq))
270
271 try:
272 for i in range(num_hits):
273 position = pos[i]
274 position -= (currentGene.start+up_cut)
275 if position < 1 or position >= len(currentSeq)-1:
276 continue
277 don[position-1] = scores[i]
278 except:
279 pdb.set_trace()
280
281 don = [-inf] + don[:-1]
282
283 #for p,e in enumerate(acc):
284 # if p not in ag_tuple_pos:
285 # acc[p] = -inf
286
287 #for p,e in enumerate(don):
288 # if p not in gt_tuple_pos:
289 # don[p] = -inf
290
291 assert ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf], pdb.set_trace()
292 assert gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf], pdb.set_trace()
293
294 assert (len(currentSeq)) == len(acc), pdb.set_trace()
295 assert (len(currentSeq)) == len(don), pdb.set_trace()
296
297 acc = [-inf] + acc[:-1]
298
299 return currentSeq, acc, don, currentExons, currentReadSeq, quality, spos
300
301
302 def reverse_complement(seq):
303 map = {'a':'t','c':'g','g':'c','t':'a'}
304
305 new_seq = [map[elem] for elem in seq]
306 new_seq.reverse()
307 new_seq = "".join(new_seq)
308
309 return new_seq
310
311
312 if __name__ == '__main__':
313 if len(sys.argv) == 1:
314 print info
315
316 assert len(sys.argv) == 6, help
317 compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,dataset_file)