+ resolved some index bugs concerning the intron boundaries resp. splicesite
[qpalma.git] / qpalma / DataProc.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from numpy.matlib import mat,zeros,ones,inf
5 import cPickle
6 import pdb
7 import Configuration as Conf
8 from PyGff import *
9 import io_pickle
10 import scipy.io
11 import pdb
12
13
14 def paths_load_data_solexa(expt,genome_info,PAR):
15 # expt can be 'training','validation' or 'test'
16 assert expt in ['training','validation','test']
17
18 dna_filename = Conf.dna_filename
19 est_filename = Conf.est_filename
20 tair7_seq_filename = Conf.tair7_seq_filename
21
22 tair7_seq = cPickle.load(open(tair7_seq_filename))
23 allGenes = cPickle.load(open(dna_filename))
24
25 Sequences = []
26 Acceptors = []
27 Donors = []
28 Exons = []
29 Ests = []
30 Qualities = []
31 SplitPositions = []
32
33 for line in open(est_filename):
34 line = line.strip()
35 chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,exon_idx = line.split()
36 splitpos = int(splitpos)
37 length = int(length)
38 prb = [ord(elem)-50 for elem in prb]
39 cal = [ord(elem)-64 for elem in cal]
40 chastity = [ord(elem)+10 for elem in chastity]
41
42 assert len(prb) == len(seq)
43
44 currentGene = allGenes[gene_id]
45 seq = seq.lower()
46
47 try:
48 currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower()
49 except:
50 continue
51
52 #assert currentSize == len(Sequences[-1]), 'gene_size %d / seq size %d' % (currentSize,len(Sequences[-1]))
53 #Acceptors.append([-inf]*currentSize)
54 #Donors.append([-inf]*currentSize)
55
56 exon_idx = int(exon_idx)
57
58 assert exon_idx > 0
59 #print "%d " % exon_idx,
60 currentExons = zeros((2,2))
61 #for idx in range(len(currentGene.exons)):
62 # currentExons[idx,0] = currentGene.exons[idx][0]-currentGene.start
63 # currentExons[idx,1] = currentGene.exons[idx][1]-currentGene.start
64
65 try:
66 currentExons[0,0] = currentGene.exons[exon_idx-1][0]-currentGene.start
67 currentExons[0,1] = currentGene.exons[exon_idx-1][1]-currentGene.start
68
69 currentExons[1,0] = currentGene.exons[exon_idx][0]-currentGene.start
70 currentExons[1,1] = currentGene.exons[exon_idx][1]-currentGene.start
71
72 cut_offset = 500
73
74 up_cut = int(currentExons[0,0]) - cut_offset
75 down_cut = int(currentExons[1,1]) + cut_offset
76 if up_cut < 0:
77 up_cut = 0
78
79 if down_cut > len(currentSeq):
80 down_cut = len(currentSeq)
81
82 assert down_cut - up_cut > 0
83 assert down_cut - up_cut <= len(currentSeq)
84
85 currentSeq = currentSeq[up_cut:down_cut]
86
87 #pdb.set_trace()
88
89 if up_cut > 0:
90 currentExons[0,0] -= up_cut
91 currentExons[0,1] -= up_cut
92
93 currentExons[1,0] -= up_cut
94 currentExons[1,1] -= up_cut
95
96 except:
97 continue
98 #pdb.set_trace()
99
100 #pdb.set_trace()
101
102 currentExons[0,1] += 1
103 #currentExons[1,0] += 1
104
105 Sequences.append(currentSeq)
106
107 currentSize = len(Sequences[-1])
108 Acceptors.append([0]*currentSize)
109 Donors.append([0]*currentSize)
110
111 Exons.append(currentExons)
112 Ests.append(seq)
113 Qualities.append(prb)
114
115 SplitPositions.append(splitpos)
116
117 if len(Sequences[-1]) == 2755:
118 Sequences = Sequences[:-1]
119 Acceptors = Acceptors[:-1]
120 Donors = Donors[:-1]
121 Exons = Exons[:-1]
122 Ests = Ests[:-1]
123 Qualities = Qualities[:-1]
124 SplitPositions = SplitPositions[:-1]
125
126 print 'found %d examples' % len(Sequences)
127
128 return Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions
129
130
131 def paths_load_data_pickle(expt,genome_info,PAR):
132 """
133
134 """
135
136 # function [Sequences, Acceptors, Donors, Exons, Ests, Noises] = paths_load_data(expt,genome_info,PAR)
137 # Load the relevant file and return the alignment data
138
139 # expt can be 'training','validation' or 'test'
140
141 assert expt in ['training','validation','test']
142
143 tmp_dir = '/fml/ag-raetsch/home/fabio/tmp'
144
145 Noises = [];
146
147 if expt == 'training':
148 if PAR.microexon:
149 if PAR.LOCAL_ALIGN: # local version
150
151 train_data = '%s/microexon_train_data.pickle' % genome_info.basedir
152 data = cPickle.load(open(train_data))
153
154 else: # global version
155 pass
156
157
158 else:
159 train_data = '%s/exons_train_local.pickle' % genome_info.basedir
160 data = cPickle.load(open(train_data))
161
162 print 'train_data is %s' % train_data
163
164 Sequences = data['Train'] # dna sequences
165 Acceptors = data['TrainAcc'] # acceptor scores
166 Donors = data['TrainDon'] # donor scores
167 Exons = data['TrainExon'] # exon boundaries
168 Ests = data['TrainEsts'] # est sequences
169
170 # Lower all indices by one to convert matlab
171 # to python indices
172
173 Exons -= 1
174
175 return Sequences, Acceptors, Donors, Exons, Ests, Noises
176
177 def paths_load_data(expt,genome_info,PAR):
178 """
179
180 """
181
182 # function [Sequences, Acceptors, Donors, Exons, Ests, Noises] = paths_load_data(expt,genome_info,PAR)
183 # Load the relevant file and return the alignment data
184
185 # expt can be 'training','validation' or 'test'
186
187 assert expt in ['training','validation','test']
188
189 tmp_dir = '/fml/ag-raetsch/home/fabio/tmp'
190
191 Noises = [];
192
193 if expt == 'training':
194 if PAR.microexon:
195 if PAR.LOCAL_ALIGN: # local version
196
197 train_data = '%s/microexon_train_data_cut_local.mat' % genome_info.basedir
198 train_data = '%s/microexon_train_data.mat' % genome_info.basedir
199 #train_data_pickle = '%s/microexon_train_data_cut_local.pickle'% tmp_dir
200 #io_pickle.convert_v6(train_data,train_data_pickle)
201 #train_data = io_pickle.load(train_data_pickle)
202 data = scipy.io.loadmat(train_data)
203
204 else: # global version
205
206 train_data = '%s/microexon_train_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat' %\
207 (genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob)
208
209 train_data = '%s/microexon_train_data.mat' % genome_info.basedir
210 #train_data_pickle = '%s/microexon_train_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.pickle' %\
211 # (tmp_dir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob)
212
213 #io_pickle.convert_v6(train_data,train_data_pickle)
214 #train_data = io_pickle.load(train_data_pickle)
215 data = scipy.io.loadmat(train_data)
216 Noises = data['TrainNoise'] # substitution matrix
217
218 else:
219 train_data = '%s/exons_train_local.mat' % genome_info.basedir
220 #train_data_pickle = '%s/exons_train_local.pickle'% tmp_dir
221 #io_pickle.convert_v6(train_data,train_data_pickle)
222 #microexon_train_data = io_pickle.load(train_data_pickle)
223 data = scipy.io.loadmat(train_data)
224
225 print 'train_data is %s' % train_data
226
227 Sequences = data['Train'] # dna sequences
228 Acceptors = data['TrainAcc'] # acceptor scores
229 Donors = data['TrainDon'] # donor scores
230 Exons = data['TrainExon'] # exon boundaries
231 Ests = data['TrainEsts'] # est sequences
232
233 #elif expt == 'validation':
234 # print('Loading validation data\n') ;
235 # if PAR.microexon,
236 # if PAR.LOCAL_ALIGN
237 # %local version
238 # load(sprintf('%s/microexon_val_data_cut_local_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
239 # genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
240 # 'ValEsts', 'ValExon', 'Val', 'ValAcc', 'ValDon') ;
241 # else
242 # %global version
243 # load(sprintf('%s/microexon_val_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
244 # genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
245 # 'ValEsts', 'ValExon', 'Val', 'ValAcc', 'ValDon') ;
246 # end
247 # else
248 # load(sprintf('%s/exons_val_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
249 # genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
250 # 'ValEsts', 'ValExon', 'Val', 'ValAcc', 'ValDon') ;
251 # end
252 #
253 # Sequences = Val ; % dna sequences
254 # Acceptors = ValAcc ; % acceptor scores
255 # Donors = ValDon ; % donor scores
256 # Exons = ValExon ; % exon boundaries
257 # Ests = ValEsts ; % est sequences
258
259
260
261 #elif expt == 'test':
262 # fprintf('Loading test data\n') ;
263 # if PAR.microexon,
264 # if PAR.LOCAL_ALIGN
265 # %local version
266 # load(sprintf('%s/microexon_test_data_cut_local_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
267 # genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
268 # 'TestEsts', 'TestExon', 'Test', 'TestAcc', 'TestDon') ;
269 # else
270 # %global version
271 # load(sprintf('%s/microexon_test_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
272 # genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
273 # 'TestEsts', 'TestExon', 'Test','TestAcc', 'TestDon', 'TestNoise') ;
274 # Noises = TestNoise ; % substitution matrix
275 # end
276 # else
277 # load(sprintf('%s/exons_test_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
278 # genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
279 # 'TestEsts', 'TestExon', 'Test', 'TestAcc', 'TestDon') ;
280 # end
281 #
282 # Sequences = Test ; % dna sequences
283 # Acceptors = TestAcc ; % acceptor scores
284 # Donors = TestDon ; % donor scores
285 # Exons = TestExon ; % exon boundaries
286 # Ests = TestEsts ; % est sequences
287
288 # Lower all indices by one to convert matlab
289 # to python indices
290
291 Exons -= 1
292
293 return Sequences, Acceptors, Donors, Exons, Ests, Noises