2 # -*- coding: utf-8 -*-
9 from qpalma
.tools
.parseGff
import parse_gff
11 from qpalma
.parsers
import FilteredReadParser
, RemappedReadParser
15 Usage of this script is:
17 ./compile_dataset.py gff_file dna_flat_files solexa_reads remapped_reads dataset_file
22 The purpose of this script is to compile a full data set for QPalma out of the
25 - The information of the gff (gene annotation).
27 - The flat dna files with the sequence of the respective gene positions
29 - The solexa reads in the filtered version
31 - The solexa reads in the remapped version
33 This represents a dataset for use with QPalma
34 The entries are def'd as follows:
36 - 'id' a unique id for the read
37 - 'chr' the chromosome identifier
38 - 'strand' the identifier for the strand either '+' or '-'
45 - 'remappped_pos' - the position of the remapped read
48 def compile_d(gff_file
,dna_flat_files
,filtered_reads
,remapped_reads
,tmp_dir
,dataset_file
):
50 assert os
.path
.exists(gff_file
)
51 for file in dna_flat_files
:
52 assert os
.path
.exists(file)
54 assert os
.path
.exists(solexa_reads
)
55 assert os
.path
.exists(remapped_reads
)
57 assert not os
.path
.exists(dataset_file
), 'The data_file already exists!'
61 # first read the gff file(s) and create pickled gene information
62 allGenes
= parse_gff(gff_file
,joinp(tmp_dir
,'gff_info.pickle'))
64 dna_filename
= Conf
.dna_filename
65 est_filename
= Conf
.est_filename
67 frp
= FilteredReadParser(filtered_reads
)
68 all_filtered_reads
= frp
.parse()
70 rrp
= RemappedReadParser(remapped_reads
)
71 all_remapped_reads
= rrp
.parse()
73 # this stores the new dataset
82 # Iterate over all remapped reads in order to generate for each read an
83 # training / prediction example
84 for reRead
in all_remapped_reads
:
85 currentId
= reRead
['id']
86 fRead
= all_filtered_reads
[currentId
]
87 currentGene
= allGenes
[gene_id
]
90 genome_file
= '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
91 sscore_filename
= '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+'
93 seq
, acc
, don
, exons
, est
, qual
, spos
= process_read(reRead
,fRead
,currentGene
,chrom
,genome_file
,sscore_filename
)
95 if newExample
== None:
103 Qualities
.append(qual
)
104 SplitPositions
.append(spos
)
107 dataset
= {'Sequences':Sequences
, 'Acceptors':Acceptors
, 'Donors':Donors\
108 'Exons':Exons
, 'Ests':Ests
, 'Qualities':Qualities
, 'SplitPositions':SplitPositions
}
110 io_pickle
.save(dataset_file
,dataset
)
113 def process_read(reRead
,fRead
,currentGene
,chrom
,genome_file
,sscore_filename
):
115 The purpose of this function is to create an example for QPalma
120 # use matlab-style functions to access dna sequence
121 dna
= load_genomic(chrom
,'+',currentGene
.start
,currentGene
.stop
,genome_file
,one_based
=True)
125 if len(currentSeq
) < (currentGene
.stop
- currentGene
.start
):
128 p_start
= fRead
['p_start']
129 exon_stop
= fRead
['exon_stop']
130 exon_start
= fRead
['exon_start']
131 p_stop
= fRead
['p_stop']
133 currentSeq
= fRead
['seq']
135 assert p_start
< exon_stop
< exon_start
< p_stop
, 'Invalid Indices'
136 assert exon_stop
- p_start
+ p_stop
- exon_start
== 36, 'Invalid read size'
137 assert p_stop
- p_start
>= 36
139 currentExons
= zeros((2,2))
141 currentExons
[0,0] = p_start
142 currentExons
[0,1] = exon_stop
144 currentExons
[1,0] = exon_start
145 currentExons
[1,1] = p_stop
147 # make indices relative to start pos of the current gene
148 currentExons
-= currentGene
.start
150 # determine cutting positions for sequences
151 up_cut
= int(currentExons
[0,0])
152 down_cut
= int(currentExons
[1,1])
154 # if we perform testing we cut a much wider region as we want to detect
155 # how good we perform on a region
161 down_cut
= down_cut
+500
163 if down_cut
> len(currentSeq
):
164 down_cut
= len(currentSeq
)
166 # cut out part of the sequence
167 currentSeq
= currentSeq
[up_cut
:down_cut
]
169 # ensure that index is correct
170 currentExons
[0,1] += 1
173 currentExons
-= up_cut
175 currentExons
-= currentExons
[0,0]
178 if not (currentSeq
[int(currentExons
[0,1])] == 'g' and\
179 currentSeq
[int(currentExons
[0,1])+1] in ['c','t' ]):
182 if not (currentSeq
[int(currentExons
[1,0])-1] == 'g' and currentSeq
[int(currentExons
[1,0])-2] == 'a'):
188 # now we want to use interval_query to get the predicted splice scores
189 # trained on the TAIR sequence and annotation
190 from Genefinding
import *
192 interval_matrix
= createIntArrayFromList([currentGene
.start
+up_cut
,currentGene
.start
+down_cut
])
196 # fetch acceptor scores
198 num_hits
= gf
.query(sscore_filename
, ['Conf'], num_fields
, interval_matrix
, num_intervals
)
199 assert num_hits
<= len(currentSeq
)
201 #print 'Acceptor hits: %d' % num_hits
203 pos
= createIntArrayFromList([0]*num_hits
)
204 indices
= createIntArrayFromList([0]*num_hits
)
205 scores
= createDoubleArrayFromList([0]*num_hits
*num_fields
)
206 gf
.getResults(pos
,indices
,scores
)
208 acc
= [-inf
]*(len(currentSeq
)+1)
210 for i
in range(num_hits
):
212 position
-= (currentGene
.start
+up_cut
)
213 assert 0 <= position
< len(currentSeq
)+1, 'position index wrong'
214 acc
[position
] = scores
[i
]
216 acc
= acc
[1:] + [-inf
]
217 Acceptors
.append(acc
)
223 num_hits
= gf
.query(sscore_filename
, ['Conf'], num_fields
, interval_matrix
, num_intervals
)
224 assert num_hits
<= len(currentSeq
)
226 #print 'Donor hits: %d' % num_hits
227 pos
= createIntArrayFromList([0]*num_hits
)
228 indices
= createIntArrayFromList([0]*num_hits
)
229 scores
= createDoubleArrayFromList([0]*num_hits
*num_fields
)
230 gf
.getResults(pos
,indices
,scores
)
232 don
= [-inf
]*(len(currentSeq
))
235 for i
in range(1,num_hits
):
237 position
-= (currentGene
.start
+up_cut
)
238 don
[position
-1] = scores
[i
]
242 don
= [-inf
] + currentDonors
[:-1]
244 return seq
, acc
, don
, currentExons
, est
, qual
, spos
247 def reverse_complement(seq
):
248 map {'a':'t','c':'g','g':'c','t':'a'}
250 new_seq
= [map[elem
] for elem
in seq
]
256 if __name__
== '__main__':
257 if len(sys
.argv
) == 1:
260 assert len(sys
.argv
) == 6, help
262 compile_d(gff_file
,dna_flat_files
,solexa_reads
,remapped_reads
,dataset_file
):