+ added exons read boundary output to filterReads output
[qpalma.git] / qpalma / DataProc.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from numpy.matlib import mat,zeros,ones,inf
5 import cPickle
6 import pdb
7 import Configuration as Conf
8 from PyGff import *
9 import io_pickle
10 import scipy.io
11 import pdb
12
13 def paths_load_data_solexa(expt,genome_info,PAR,test=False):
14 # expt can be 'training','validation' or 'test'
15 assert expt in ['training','validation','test']
16
17 dna_filename = Conf.dna_filename
18 est_filename = Conf.est_filename
19 tair7_seq_filename = Conf.tair7_seq_filename
20
21 tair7_seq = cPickle.load(open(tair7_seq_filename))
22 allGenes = cPickle.load(open(dna_filename))
23
24 Sequences = []
25 Acceptors = []
26 Donors = []
27 Exons = []
28 Ests = []
29 Qualities = []
30 SplitPositions = []
31
32 for line in open(est_filename):
33 line = line.strip()
34 chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
35 splitpos = int(splitpos)
36 length = int(length)
37 prb = [ord(elem)-50 for elem in prb]
38 cal = [ord(elem)-64 for elem in cal]
39 chastity = [ord(elem)+10 for elem in chastity]
40
41 assert len(prb) == len(seq)
42
43 currentGene = allGenes[gene_id]
44 seq = seq.lower()
45
46 try:
47 currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower()
48 except:
49 continue
50
51 if len(currentSeq) < (currentGene.stop - currentGene.start):
52 continue
53
54 oldSeq = currentSeq
55
56 p_start = int(p_start)
57 exon_stop = int(exon_stop)
58 exon_start = int(exon_start)
59 p_stop = int(p_stop)
60
61 assert p_start <= exon_stop <= exon_start <= p_stop, 'Invalid Indices'
62 assert p_stop - p_start >= 36
63
64 currentExons = zeros((2,2))
65
66 currentExons[0,0] = p_start-currentGene.start
67 currentExons[0,1] = exon_stop-currentGene.start
68
69 currentExons[1,0] = exon_start-currentGene.start
70 currentExons[1,1] = p_stop-currentGene.start
71
72 import copy
73 oldExons = copy.deepcopy(currentExons)
74
75 up_cut = int(currentExons[0,0])
76 down_cut = int(currentExons[1,1])
77
78 if test:
79 up_cut = up_cut-500
80 if up_cut < 0:
81 up_cut = 0
82
83 down_cut = down_cut+500
84
85 if down_cut > len(currentSeq):
86 down_cut = len(currentSeq)
87
88 currentSeq = currentSeq[up_cut:down_cut]
89
90 currentExons[0,1] += 1
91 #currentExons[1,0] += 1
92
93 if test:
94 currentExons -= up_cut
95 else:
96 currentExons -= currentExons[0,0]
97
98 try:
99 if not (currentSeq[int(currentExons[0,1])] == 'g' and currentSeq[int(currentExons[0,1])+1] == 't'):
100 continue
101
102 if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
103 continue
104
105 except:
106 pdb.set_trace()
107
108 Sequences.append(currentSeq)
109
110 currentSize = len(Sequences[-1])
111 Acceptors.append([0]*currentSize)
112 Donors.append([0]*currentSize)
113
114 Exons.append(currentExons)
115 Ests.append(seq)
116 Qualities.append(prb)
117
118 SplitPositions.append(splitpos)
119
120 #if len(Sequences[-1]) == 2755:
121 # Sequences = Sequences[:-1]
122 # Acceptors = Acceptors[:-1]
123 # Donors = Donors[:-1]
124 # Exons = Exons[:-1]
125 # Ests = Ests[:-1]
126 # Qualities = Qualities[:-1]
127 # SplitPositions = SplitPositions[:-1]
128
129 print 'found %d examples' % len(Sequences)
130
131 return Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions
132
133
134 def paths_load_data_pickle(expt,genome_info,PAR):
135 """
136
137 """
138
139 # function [Sequences, Acceptors, Donors, Exons, Ests, Noises] = paths_load_data(expt,genome_info,PAR)
140 # Load the relevant file and return the alignment data
141
142 # expt can be 'training','validation' or 'test'
143
144 assert expt in ['training','validation','test']
145
146 tmp_dir = '/fml/ag-raetsch/home/fabio/tmp'
147
148 Noises = [];
149
150 if expt == 'training':
151 if PAR.microexon:
152 if PAR.LOCAL_ALIGN: # local version
153
154 train_data = '%s/microexon_train_data.pickle' % genome_info.basedir
155 data = cPickle.load(open(train_data))
156
157 else: # global version
158 pass
159
160
161 else:
162 train_data = '%s/exons_train_local.pickle' % genome_info.basedir
163 data = cPickle.load(open(train_data))
164
165 print 'train_data is %s' % train_data
166
167 Sequences = data['Train'] # dna sequences
168 Acceptors = data['TrainAcc'] # acceptor scores
169 Donors = data['TrainDon'] # donor scores
170 Exons = data['TrainExon'] # exon boundaries
171 Ests = data['TrainEsts'] # est sequences
172
173 # Lower all indices by one to convert matlab
174 # to python indices
175
176 Exons -= 1
177
178 return Sequences, Acceptors, Donors, Exons, Ests, Noises
179
180 def paths_load_data(expt,genome_info,PAR):
181 """
182
183 """
184
185 # function [Sequences, Acceptors, Donors, Exons, Ests, Noises] = paths_load_data(expt,genome_info,PAR)
186 # Load the relevant file and return the alignment data
187
188 # expt can be 'training','validation' or 'test'
189
190 assert expt in ['training','validation','test']
191
192 tmp_dir = '/fml/ag-raetsch/home/fabio/tmp'
193
194 Noises = [];
195
196 if expt == 'training':
197 if PAR.microexon:
198 if PAR.LOCAL_ALIGN: # local version
199
200 train_data = '%s/microexon_train_data_cut_local.mat' % genome_info.basedir
201 train_data = '%s/microexon_train_data.mat' % genome_info.basedir
202 #train_data_pickle = '%s/microexon_train_data_cut_local.pickle'% tmp_dir
203 #io_pickle.convert_v6(train_data,train_data_pickle)
204 #train_data = io_pickle.load(train_data_pickle)
205 data = scipy.io.loadmat(train_data)
206
207 else: # global version
208
209 train_data = '%s/microexon_train_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat' %\
210 (genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob)
211
212 train_data = '%s/microexon_train_data.mat' % genome_info.basedir
213 #train_data_pickle = '%s/microexon_train_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.pickle' %\
214 # (tmp_dir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob)
215
216 #io_pickle.convert_v6(train_data,train_data_pickle)
217 #train_data = io_pickle.load(train_data_pickle)
218 data = scipy.io.loadmat(train_data)
219 Noises = data['TrainNoise'] # substitution matrix
220
221 else:
222 train_data = '%s/exons_train_local.mat' % genome_info.basedir
223 #train_data_pickle = '%s/exons_train_local.pickle'% tmp_dir
224 #io_pickle.convert_v6(train_data,train_data_pickle)
225 #microexon_train_data = io_pickle.load(train_data_pickle)
226 data = scipy.io.loadmat(train_data)
227
228 print 'train_data is %s' % train_data
229
230 Sequences = data['Train'] # dna sequences
231 Acceptors = data['TrainAcc'] # acceptor scores
232 Donors = data['TrainDon'] # donor scores
233 Exons = data['TrainExon'] # exon boundaries
234 Ests = data['TrainEsts'] # est sequences
235
236 #elif expt == 'validation':
237 # print('Loading validation data\n') ;
238 # if PAR.microexon,
239 # if PAR.LOCAL_ALIGN
240 # %local version
241 # load(sprintf('%s/microexon_val_data_cut_local_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
242 # genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
243 # 'ValEsts', 'ValExon', 'Val', 'ValAcc', 'ValDon') ;
244 # else
245 # %global version
246 # load(sprintf('%s/microexon_val_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
247 # genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
248 # 'ValEsts', 'ValExon', 'Val', 'ValAcc', 'ValDon') ;
249 # end
250 # else
251 # load(sprintf('%s/exons_val_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
252 # genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
253 # 'ValEsts', 'ValExon', 'Val', 'ValAcc', 'ValDon') ;
254 # end
255 #
256 # Sequences = Val ; % dna sequences
257 # Acceptors = ValAcc ; % acceptor scores
258 # Donors = ValDon ; % donor scores
259 # Exons = ValExon ; % exon boundaries
260 # Ests = ValEsts ; % est sequences
261
262
263
264 #elif expt == 'test':
265 # fprintf('Loading test data\n') ;
266 # if PAR.microexon,
267 # if PAR.LOCAL_ALIGN
268 # %local version
269 # load(sprintf('%s/microexon_test_data_cut_local_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
270 # genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
271 # 'TestEsts', 'TestExon', 'Test', 'TestAcc', 'TestDon') ;
272 # else
273 # %global version
274 # load(sprintf('%s/microexon_test_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
275 # genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
276 # 'TestEsts', 'TestExon', 'Test','TestAcc', 'TestDon', 'TestNoise') ;
277 # Noises = TestNoise ; % substitution matrix
278 # end
279 # else
280 # load(sprintf('%s/exons_test_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
281 # genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
282 # 'TestEsts', 'TestExon', 'Test', 'TestAcc', 'TestDon') ;
283 # end
284 #
285 # Sequences = Test ; % dna sequences
286 # Acceptors = TestAcc ; % acceptor scores
287 # Donors = TestDon ; % donor scores
288 # Exons = TestExon ; % exon boundaries
289 # Ests = TestEsts ; % est sequences
290
291 # Lower all indices by one to convert matlab
292 # to python indices
293
294 Exons -= 1
295
296 return Sequences, Acceptors, Donors, Exons, Ests, Noises