#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import os
import pdb
import io_pickle

import numpy
from numpy.matlib import mat, zeros, ones, inf

import qpalma
import qpalma.tools
from qpalma.tools.parseGff import parse_gff

from qpalma.parsers import FilteredReadParser, RemappedReadParser

from Genefinding import *

from genome_utils import load_genomic
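# io_pickle, Genefinding and genome_utils are not part of the standard library;
# they appear to be project-/lab-internal helper modules and are assumed to be
# available on the PYTHONPATH.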
help = """

Usage of this script is:

  ./compile_dataset.py gff_file dna_flat_files solexa_reads remapped_reads dataset_file

"""

info = """
The purpose of this script is to compile a full data set for QPalma out of the
following files:

- The gene annotation information from the gff file.

- The flat dna files with the sequences of the respective gene positions.

- The solexa reads in the filtered version.

- The solexa reads in the remapped version.

Together these form a dataset for use with QPalma.
The entries are defined as follows:

- 'id' a unique id for the read
- 'chr' the chromosome identifier
- 'strand' the strand identifier, either '+' or '-'

- 'read_start'
- 'exon_stop'
- 'exon_start'
- 'read_stop'

- 'remapped_pos' the position of the remapped read
"""
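# Note: as asserted in process_read() below, the read coordinates satisfy
# read_start < exon_stop < exon_start < read_stop, and the two exonic pieces of
# a spliced read together account for the full 36 bp read length.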

def compile_d(gff_file, dna_flat_files, filtered_reads, remapped_reads, tmp_dir, dataset_file):

    assert os.path.exists(gff_file)
    #for file in dna_flat_files:
    #    assert os.path.exists(file)

    assert os.path.exists(filtered_reads)
    assert os.path.exists(remapped_reads)

    assert not os.path.exists(dataset_file), 'The dataset file already exists!'

    joinp = os.path.join

    # first read the gff file(s) and create pickled gene information
    allGenes = parse_gff(gff_file) #,joinp(tmp_dir,'gff_info.pickle'))

    print 'parsing filtered reads'
    frp = FilteredReadParser(filtered_reads)
    all_filtered_reads = frp.parse()

    print 'parsing remapped reads'
    rrp = RemappedReadParser(remapped_reads)
    all_remapped_reads = rrp.parse()

    # these lists store the entries of the new dataset
    Sequences = []
    Acceptors = []
    Donors = []
    Exons = []
    Ests = []
    Qualities = []
    SplitPositions = []
    Ids = []

    # iterate over all filtered reads and generate a training / prediction
    # example for each of them
    for id, currentFRead in all_filtered_reads.items():

        try:
            currentRReads = all_remapped_reads[id]
        except KeyError:
            currentRReads = None

        gene_id = currentFRead['gene_id']
        currentGene = allGenes[gene_id]

        # FIXME: the chromosome, the genome location and the splice score
        # predictions are currently hard-coded
        chrom = 'chr1'
        genome_file = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
        sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+'

        seq, acc, don, exons, est, qual, spos = process_read(currentRReads, currentFRead, currentGene, chrom, genome_file, sscore_filename)

        if seq == '':
            continue

        Sequences.append(seq)
        Acceptors.append(acc)
        Donors.append(don)
        Exons.append(exons)
        Ests.append(est)
        Qualities.append(qual)
        SplitPositions.append(spos)
        Ids.append(id)

    dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors,
               'Exons':Exons, 'Ests':Ests, 'Qualities':Qualities,
               'SplitPositions':SplitPositions, 'Ids':Ids}

    # saving new dataset
    io_pickle.save(dataset_file, dataset)
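    # The saved dataset is a dict of parallel lists: for each index i the
    # entries Sequences[i], Acceptors[i], Donors[i], Exons[i], Ests[i],
    # Qualities[i], SplitPositions[i] and Ids[i] all describe the same read.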


def process_read(reReads, fRead, currentGene, chrom, genome_file, sscore_filename):
    """
    The purpose of this function is to create one training / prediction example
    for QPalma from a filtered read, its remapped reads and the gene it was
    assigned to.
    """
    # use matlab-style functions to access the dna sequence
    currentSeq = load_genomic(chrom, '+', currentGene.start, currentGene.stop, genome_file, one_based=True)
    currentSeq = currentSeq.lower()

    nil = ('', '', '', '', '', '', '')

    if len(currentSeq) < (currentGene.stop - currentGene.start):
        return nil

    p_start = fRead['p_start']
    exon_stop = fRead['exon_stop']
    exon_start = fRead['exon_start']
    p_stop = fRead['p_stop']

    currentReadSeq = fRead['seq']
    quality = fRead['prb']
    spos = fRead['splitpos']

    assert p_start < exon_stop < exon_start < p_stop, 'Invalid indices'
    assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
    assert p_stop - p_start >= 36

    currentExons = zeros((2, 2), dtype=numpy.int)

    currentExons[0, 0] = p_start
    currentExons[0, 1] = exon_stop

    currentExons[1, 0] = exon_start
    currentExons[1, 1] = p_stop
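    # row 0 of currentExons holds the upstream exonic fragment of the read
    # (p_start .. exon_stop), row 1 the downstream fragment (exon_start .. p_stop)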

    # make the indices relative to the start position of the current gene
    currentExons -= currentGene.start

    # determine the cutting positions for the sequence
    up_cut = int(currentExons[0, 0])
    down_cut = int(currentExons[1, 1])

    # when testing we cut out a much wider region, because we want to see how
    # well we perform on a whole region
    test = False

    if test:
        up_cut = up_cut - 500
        if up_cut < 0:
            up_cut = 0

        down_cut = down_cut + 500

        if down_cut > len(currentSeq):
            down_cut = len(currentSeq)

    # cut out the relevant part of the sequence
    currentSeq = currentSeq[up_cut:down_cut]

    # shift the end of the first exon by one so that it points at the first
    # intronic position
    currentExons[0, 1] += 1

    if test:
        currentExons -= up_cut
    else:
        currentExons -= currentExons[0, 0]

    try:
        # the intron has to start with the donor consensus 'gt' / 'gc' ...
        if not (currentSeq[int(currentExons[0, 1])] == 'g' and
                currentSeq[int(currentExons[0, 1]) + 1] in ['c', 't']):
            return nil

        # ... and end with the acceptor consensus 'ag'
        if not (currentSeq[int(currentExons[1, 0]) - 1] == 'g' and currentSeq[int(currentExons[1, 0]) - 2] == 'a'):
            return nil

    except IndexError:
        # exon indices outside the cut-out sequence: drop into the debugger
        pdb.set_trace()

    # now we want to use interval_query to get the predicted splice scores
    # trained on the TAIR sequence and annotation

    interval_matrix = createIntArrayFromList([currentGene.start + up_cut, currentGene.start + down_cut])
    num_fields = 1
    num_intervals = 1
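    # NOTE: the acceptor and the donor query below both read from the same
    # sscore_filename, which points at donor SVM predictions; this looks like a
    # placeholder and presumably needs separate acceptor and donor prediction
    # files.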

    # fetch acceptor scores
    gf = Genef()
    num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
    assert num_hits <= len(currentSeq)

    #print 'Acceptor hits: %d' % num_hits
    pos = createIntArrayFromList([0] * num_hits)
    indices = createIntArrayFromList([0] * num_hits)
    scores = createDoubleArrayFromList([0] * num_hits * num_fields)
    gf.getResults(pos, indices, scores)

    acc = [-inf] * (len(currentSeq) + 1)

    for i in range(num_hits):
        position = pos[i]
        position -= (currentGene.start + up_cut)
        assert 0 <= position < len(currentSeq) + 1, 'position index wrong'
        acc[position] = scores[i]
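    # shifting all acceptor scores one position to the left below presumably
    # maps the one-based splice site offsets onto zero-based indices into
    # currentSeq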
    acc = acc[1:] + [-inf]

    del gf

    # fetch donor scores
    gf = Genef()
    num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
    assert num_hits <= len(currentSeq)

    #print 'Donor hits: %d' % num_hits
    pos = createIntArrayFromList([0] * num_hits)
    indices = createIntArrayFromList([0] * num_hits)
    scores = createDoubleArrayFromList([0] * num_hits * num_fields)
    gf.getResults(pos, indices, scores)

    don = [-inf] * (len(currentSeq))
    try:
        # note that, unlike the acceptor loop above, this loop starts at i = 1
        # and therefore skips the first hit
        for i in range(1, num_hits):
            position = pos[i]
            position -= (currentGene.start + up_cut)
            don[position - 1] = scores[i]
    except IndexError:
        # a position outside the cut-out sequence: drop into the debugger
        pdb.set_trace()
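    # the donor scores are shifted one position to the right below; presumably
    # this matches the splice site indexing convention QPalma expects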
    don = [-inf] + don[:-1]

    return currentSeq, acc, don, currentExons, currentReadSeq, quality, spos


def reverse_complement(seq):
    complement = {'a': 't', 'c': 'g', 'g': 'c', 't': 'a'}

    new_seq = [complement[elem] for elem in seq]
    new_seq.reverse()

    return ''.join(new_seq)
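# example: reverse_complement('ggca') == 'tgcc'; note that the mapping only
# covers lower-case a/c/g/t, so any other character raises a KeyError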


if __name__ == '__main__':
    if len(sys.argv) == 1:
        print info
        sys.exit(0)

    assert len(sys.argv) == 6, help

    gff_file, dna_flat_files, solexa_reads, remapped_reads, dataset_file = sys.argv[1:]

    # tmp_dir is only needed by the (currently commented-out) gff pickling in
    # compile_d, so None is passed here
    compile_d(gff_file, dna_flat_files, solexa_reads, remapped_reads, None, dataset_file)
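
# Example invocation (all file names are purely illustrative):
#
#   ./compile_dataset.py genes.gff dna_flat_files/ reads.filtered reads.remapped dataset.pickle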