#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import sys
import pdb

from numpy import zeros, inf

import io_pickle

import qpalma.tools
from qpalma.tools.parseGff import parse_gff

from qpalma.parsers import FilteredReadParser, RemappedReadParser

help = """

Usage of this script is:

./compile_dataset.py gff_file dna_flat_files solexa_reads remapped_reads tmp_dir dataset_file

"""
20
21 info = """
22 The purpose of this script is to compile a full data set for QPalma out of the
23 following files:
24
25 - The information of the gff (gene annotation).
26
27 - The flat dna files with the sequence of the respective gene positions
28
29 - The solexa reads in the filtered version
30
31 - The solexa reads in the remapped version
32
33 This represents a dataset for use with QPalma
34 The entries are def'd as follows:
35
36 - 'id' a unique id for the read
37 - 'chr' the chromosome identifier
38 - 'strand' the identifier for the strand either '+' or '-'
39
40 - 'read_start'
41 - 'exon_stop'
42 - 'exon_start'
43 - 'read_stop'
44
45 - 'remappped_pos' - the position of the remapped read
46 """

def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file):

    assert os.path.exists(gff_file)
    for file in dna_flat_files:
        assert os.path.exists(file)

    assert os.path.exists(filtered_reads)
    assert os.path.exists(remapped_reads)

    assert not os.path.exists(dataset_file), 'The dataset_file already exists!'

    joinp = os.path.join

    # first read the gff file(s) and create pickled gene information
    allGenes = parse_gff(gff_file,joinp(tmp_dir,'gff_info.pickle'))

    # dna_filename = Conf.dna_filename   # unused; Conf is not imported in this script
    # est_filename = Conf.est_filename

    frp = FilteredReadParser(filtered_reads)
    all_filtered_reads = frp.parse()

    rrp = RemappedReadParser(remapped_reads)
    all_remapped_reads = rrp.parse()

    # these lists store the new dataset
    Sequences = []
    Acceptors = []
    Donors = []
    Exons = []
    Ests = []
    Qualities = []
    SplitPositions = []

    # iterate over all remapped reads in order to generate a training /
    # prediction example for each read
    for reRead in all_remapped_reads:
        currentId = reRead['id']
        fRead = all_filtered_reads[currentId]

        # assumption: the filtered read record carries the id of the gene it
        # belongs to, which is used to look up the gene annotation
        gene_id = fRead['gene_id']
        currentGene = allGenes[gene_id]

        chrom = 'chr1'
        genome_file = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
        sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+'

        newExample = process_read(reRead,fRead,currentGene,chrom,genome_file,sscore_filename)

        if newExample is None:
            continue

        seq, acc, don, exons, est, qual, spos = newExample

        Sequences.append(seq)
        Acceptors.append(acc)
        Donors.append(don)
        Exons.append(exons)
        Ests.append(est)
        Qualities.append(qual)
        SplitPositions.append(spos)

    dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors,
               'Exons':Exons, 'Ests':Ests, 'Qualities':Qualities,
               'SplitPositions':SplitPositions}

    # saving the new dataset
    io_pickle.save(dataset_file,dataset)
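
    # A minimal sketch of how the saved dataset could be read back later,
    # assuming io_pickle.load is the counterpart of io_pickle.save used above:
    #
    #   dataset   = io_pickle.load(dataset_file)
    #   Sequences = dataset['Sequences']
    #   Exons     = dataset['Exons']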


def process_read(reRead,fRead,currentGene,chrom,genome_file,sscore_filename,test=False):
    """
    The purpose of this function is to create one training / prediction
    example for QPalma out of a remapped read, its filtered counterpart and
    the gene it belongs to.

    It returns a tuple (seq, acc, don, exons, est, qual, spos) or None if the
    read cannot be turned into a valid example. If 'test' is set, a much
    wider genomic region around the read is cut out.
    """

    # use matlab-style functions to access the dna sequence
    # (load_genomic is assumed to be provided by the QPalma / Genefinding
    # utilities available in this environment)
    dna = load_genomic(chrom,'+',currentGene.start,currentGene.stop,genome_file,one_based=True)
    dna = dna.lower()
    currentSeq = dna

    if len(currentSeq) < (currentGene.stop - currentGene.start):
        return None

    p_start    = fRead['p_start']
    exon_stop  = fRead['exon_stop']
    exon_start = fRead['exon_start']
    p_stop     = fRead['p_stop']

    # the read sequence, its quality values and the split position as reported
    # by the parsers (the 'quality' and 'split_pos' keys are assumptions about
    # the parser output)
    est  = fRead['seq']
    qual = fRead.get('quality')
    spos = reRead.get('split_pos')

    assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
    assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
    assert p_stop - p_start >= 36

    currentExons = zeros((2,2))

    currentExons[0,0] = p_start
    currentExons[0,1] = exon_stop

    currentExons[1,0] = exon_start
    currentExons[1,1] = p_stop

    # make indices relative to the start pos of the current gene
    currentExons -= currentGene.start

    # determine cutting positions for the sequence
    up_cut   = int(currentExons[0,0])
    down_cut = int(currentExons[1,1])

    # if we perform testing we cut a much wider region as we want to detect
    # how well we perform on a region
    if test:
        up_cut = up_cut-500
        if up_cut < 0:
            up_cut = 0

        down_cut = down_cut+500

        if down_cut > len(currentSeq):
            down_cut = len(currentSeq)

    # cut out the relevant part of the sequence
    currentSeq = currentSeq[up_cut:down_cut]

    # ensure that the exon end index is correct
    currentExons[0,1] += 1

    if test:
        currentExons -= up_cut
    else:
        currentExons -= currentExons[0,0]

    # discard reads whose splice sites do not show the expected consensus
    # dinucleotides (donor 'gt'/'gc', acceptor 'ag')
    try:
        if not (currentSeq[int(currentExons[0,1])] == 'g' and
                currentSeq[int(currentExons[0,1])+1] in ['c','t']):
            return None

        if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
            return None

    except:
        pdb.set_trace()

    # now we want to use interval_query to get the predicted splice scores
    # trained on the TAIR sequence and annotation
    from Genefinding import Genef, createIntArrayFromList, createDoubleArrayFromList

    interval_matrix = createIntArrayFromList([currentGene.start+up_cut,currentGene.start+down_cut])
    num_fields = 1
    num_intervals = 1

    # fetch acceptor scores
    gf = Genef()
    num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
    assert num_hits <= len(currentSeq)

    #print 'Acceptor hits: %d' % num_hits

    pos     = createIntArrayFromList([0]*num_hits)
    indices = createIntArrayFromList([0]*num_hits)
    scores  = createDoubleArrayFromList([0]*num_hits*num_fields)
    gf.getResults(pos,indices,scores)

    acc = [-inf]*(len(currentSeq)+1)

    for i in range(num_hits):
        position = pos[i]
        position -= (currentGene.start+up_cut)
        assert 0 <= position < len(currentSeq)+1, 'position index wrong'
        acc[position] = scores[i]

    # shift the acceptor scores by one position
    acc = acc[1:] + [-inf]

    del gf

    # fetch donor scores
    gf = Genef()
    num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
    assert num_hits <= len(currentSeq)

    #print 'Donor hits: %d' % num_hits
    pos     = createIntArrayFromList([0]*num_hits)
    indices = createIntArrayFromList([0]*num_hits)
    scores  = createDoubleArrayFromList([0]*num_hits*num_fields)
    gf.getResults(pos,indices,scores)

    don = [-inf]*(len(currentSeq))

    try:
        for i in range(1,num_hits):
            position = pos[i]
            position -= (currentGene.start+up_cut)
            don[position-1] = scores[i]
    except:
        pdb.set_trace()

    # shift the donor scores by one position
    don = [-inf] + don[:-1]

    return currentSeq, acc, don, currentExons, est, qual, spos


def reverse_complement(seq):
    complement = {'a':'t','c':'g','g':'c','t':'a'}

    new_seq = [complement[elem] for elem in seq]
    new_seq.reverse()

    return "".join(new_seq)
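
# As a quick illustration of the helper above (values shown for a made-up
# read sequence):
#
#   reverse_complement('gattaca')   # -> 'tgtaatc'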


if __name__ == '__main__':
    if len(sys.argv) == 1:
        print info

    assert len(sys.argv) == 7, help

    gff_file       = sys.argv[1]
    # assumption: several dna flat files are passed as one comma-separated list
    dna_flat_files = sys.argv[2].split(',')
    solexa_reads   = sys.argv[3]
    remapped_reads = sys.argv[4]
    tmp_dir        = sys.argv[5]
    dataset_file   = sys.argv[6]

    compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,tmp_dir,dataset_file)