+ added script to generate datasets
[qpalma.git] / scripts / compile_dataset.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
import os
import pdb
import sys

from numpy import inf, zeros

import io_pickle
import qpalma.tools
from qpalma.tools.parseGff import parse_gff
from qpalma.parsers import FilteredReadParser, RemappedReadParser
12
# CLI usage string, shown (via assert) when the argument count is wrong.
help = """

Usage of this script is:

./compile_dataset.py gff_file dna_flat_files solexa_reads remapped_reads dataset_file

"""

# Long description printed when the script is invoked without arguments.
# FIX: "remappped_pos" typo corrected to "remapped_pos".
info = """
The purpose of this script is to compile a full data set for QPalma out of the
following files:

- The information of the gff (gene annotation).

- The flat dna files with the sequence of the respective gene positions

- The solexa reads in the filtered version

- The solexa reads in the remapped version

This represents a dataset for use with QPalma
The entries are def'd as follows:

- 'id' a unique id for the read
- 'chr' the chromosome identifier
- 'strand' the identifier for the strand either '+' or '-'

- 'read_start'
- 'exon_stop'
- 'exon_start'
- 'read_stop'

- 'remapped_pos' - the position of the remapped read
"""
47
def compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,tmp_dir,dataset_file,test=False):
    """
    Compile a full dataset for QPalma and pickle it to dataset_file.

    Combines the gene annotation (gff_file), the genomic sequence around each
    gene, and the filtered/remapped solexa reads into parallel lists.

    Parameters:
      gff_file       -- path to the gff annotation file
      dna_flat_files -- iterable of paths to the flat dna files
      solexa_reads   -- path to the filtered solexa reads
      remapped_reads -- path to the remapped solexa reads
      tmp_dir        -- directory for intermediate pickles (gff info)
      dataset_file   -- output pickle file; must not exist yet
      test           -- if True, cut a 500nt wider region around each read so
                        prediction quality over a whole region can be assessed
                        (BUGFIX: 'test' was previously an undefined free
                        variable and raised NameError at runtime)

    Returns (Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions).
    """
    assert os.path.exists(gff_file)
    for flat_file in dna_flat_files:
        assert os.path.exists(flat_file)

    assert os.path.exists(solexa_reads)
    assert os.path.exists(remapped_reads)

    assert not os.path.exists(dataset_file), 'The data_file already exists!'

    joinp = os.path.join

    # First read the gff file(s) and create pickled gene information.
    allGenes = parse_gff(gff_file,joinp(tmp_dir,'gff_info.pickle'))

    # NOTE(review): Conf is referenced but never imported in this file --
    # presumably the qpalma configuration module; confirm before running.
    est_filename = Conf.est_filename

    Sequences = []
    Acceptors = []
    Donors = []
    Exons = []
    Ests = []
    Qualities = []
    SplitPositions = []

    # Splice-score predictions trained on the TAIR sequence and annotation.
    # BUGFIX: the original did 'from Genefinding import *' inside the per-read
    # loop body; hoisted here and made explicit.
    from Genefinding import Genef, createIntArrayFromList, createDoubleArrayFromList

    acc_fname = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_1+'
    don_fname = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+'

    genome_file = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
    chrom = 'chr1'

    # Iterate over all reads.
    for line in open(est_filename):
        line = line.strip()

        # BUGFIX: the original unpacked the remapped-read fields
        # (id,chr,pos,strand,mismatches,...) here, but the loop body consumes
        # the filtered-read fields below, leaving seq/prb/splitpos/... undefined.
        chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,\
            p_start,exon_stop,exon_start,p_stop = line.split()

        splitpos = int(splitpos)
        length = int(length)

        # Quality strings are stored as printable ascii with fixed offsets.
        prb      = [ord(elem)-50 for elem in prb]
        cal      = [ord(elem)-64 for elem in cal]
        chastity = [ord(elem)+10 for elem in chastity]

        assert len(prb) == len(seq)

        currentGene = allGenes[gene_id]
        seq = seq.lower()

        # Use matlab-style functions to access the dna sequence.
        # NOTE(review): load_genomic is not imported in this file -- confirm
        # its origin (qpalma genomic utilities) before running.
        dna = load_genomic(chrom,'+',currentGene.start,currentGene.stop,genome_file,one_based=True)
        currentSeq = dna

        if len(currentSeq) < (currentGene.stop - currentGene.start):
            continue

        p_start    = int(p_start)
        exon_stop  = int(exon_stop)
        exon_start = int(exon_start)
        p_stop     = int(p_stop)

        assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
        assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
        assert p_stop - p_start >= 36

        currentExons = zeros((2,2))

        currentExons[0,0] = p_start
        currentExons[0,1] = exon_stop

        currentExons[1,0] = exon_start
        currentExons[1,1] = p_stop

        # Make indices relative to the start position of the current gene.
        currentExons -= currentGene.start

        # Determine cutting positions for the sequence.
        up_cut   = int(currentExons[0,0])
        down_cut = int(currentExons[1,1])

        # If we perform testing we cut a much wider region, as we want to see
        # how well we perform on a whole region.
        if test:
            up_cut   = max(up_cut - 500, 0)
            down_cut = min(down_cut + 500, len(currentSeq))

        # Cut out the relevant part of the sequence.
        currentSeq = currentSeq[up_cut:down_cut]

        # Ensure that the exon-stop index is correct.
        currentExons[0,1] += 1

        if test:
            currentExons -= up_cut
        else:
            currentExons -= currentExons[0,0]

        # Keep only reads whose implied intron has the canonical donor (gt/gc)
        # and acceptor (ag) dinucleotides.
        # BUGFIX: the original wrapped this in a bare except that dropped into
        # pdb on out-of-range indices; skip such entries instead.
        don_pos = int(currentExons[0,1])
        acc_pos = int(currentExons[1,0])

        if don_pos + 1 >= len(currentSeq) or acc_pos - 2 < 0:
            continue

        if not (currentSeq[don_pos] == 'g' and currentSeq[don_pos+1] in ['c','t']):
            continue

        if not (currentSeq[acc_pos-1] == 'g' and currentSeq[acc_pos-2] == 'a'):
            continue

        Sequences.append(currentSeq)

        # Use interval_query to get the predicted splice scores trained on the
        # TAIR sequence and annotation.
        interval_matrix = createIntArrayFromList([currentGene.start+up_cut,currentGene.start+down_cut])
        num_fields    = 1
        num_intervals = 1

        # Fetch acceptor scores.
        gf = Genef()
        num_hits = gf.query(acc_fname, ['Conf'], num_fields, interval_matrix, num_intervals)
        assert num_hits <= len(currentSeq)

        pos     = createIntArrayFromList([0]*num_hits)
        indices = createIntArrayFromList([0]*num_hits)
        scores  = createDoubleArrayFromList([0]*num_hits*num_fields)
        gf.getResults(pos,indices,scores)

        currentAcceptors = [-inf]*(len(currentSeq)+1)

        for i in range(num_hits):
            position = pos[i] - (currentGene.start+up_cut)
            assert 0 <= position < len(currentSeq)+1, 'position index wrong'
            currentAcceptors[position] = scores[i]

        # Acceptor scores are shifted by one position wrt. the sequence.
        currentAcceptors = currentAcceptors[1:] + [-inf]
        Acceptors.append(currentAcceptors)

        del gf

        # Fetch donor scores.
        gf = Genef()
        num_hits = gf.query(don_fname, ['Conf'], num_fields, interval_matrix, num_intervals)
        assert num_hits <= len(currentSeq)

        pos     = createIntArrayFromList([0]*num_hits)
        indices = createIntArrayFromList([0]*num_hits)
        scores  = createDoubleArrayFromList([0]*num_hits*num_fields)
        gf.getResults(pos,indices,scores)

        currentDonors = [-inf]*(len(currentSeq))

        # NOTE(review): the original skipped hit 0 here (range(1, num_hits))
        # -- kept as-is, presumably deliberate; confirm.
        # BUGFIX: replaced the bare except/pdb.set_trace() with a bounds guard.
        for i in range(1,num_hits):
            position = pos[i] - (currentGene.start+up_cut)
            if 0 < position <= len(currentDonors):
                currentDonors[position-1] = scores[i]

        currentDonors = [-inf] + currentDonors[:-1]
        Donors.append(currentDonors)

        del gf

        Exons.append(currentExons)
        Ests.append(seq)
        Qualities.append(prb)
        SplitPositions.append(splitpos)

    # Persist the compiled dataset. BUGFIX: the original left dataset_file
    # unwritten -- the trailing code meant to pickle it was dead (after the
    # return) and syntactically invalid.
    dataset = (Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions)
    io_pickle.save(dataset_file,dataset)

    return Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions
263
264
if __name__ == '__main__':
    # With no arguments, just describe what this script does.
    if len(sys.argv) == 1:
        print(info)
        sys.exit(0)

    # BUGFIX: the original never read the arguments out of sys.argv and the
    # final call had a trailing ':' (syntax error) and was missing tmp_dir.
    assert len(sys.argv) == 6, help

    gff_file       = sys.argv[1]
    # NOTE(review): compile_d iterates dna_flat_files; assume a single
    # comma-separated token on the command line -- confirm against callers.
    dna_flat_files = sys.argv[2].split(',')
    solexa_reads   = sys.argv[3]
    remapped_reads = sys.argv[4]
    dataset_file   = sys.argv[5]

    # Intermediate pickles (gff info) go next to the output file.
    tmp_dir = os.path.dirname(dataset_file) or '.'

    compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,tmp_dir,dataset_file)