#!/usr/bin/env python
# -*- coding: utf-8 -*-
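
"""
Data loading routines for QPalma.

The functions in this module read pickled training/test data and, in the
Solexa loader, remapped reads together with precomputed splice-site scores,
and return the per-example lists (sequences, acceptor/donor scores, exon
boundaries, reads, qualities, ...) consumed by the rest of the pipeline.
"""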
3
4 from numpy.matlib import mat,zeros,ones,inf
5 import cPickle
6 import pdb
7 #import pydb
8 import Configuration as Conf
9 import tools
10 from tools.PyGff import *
11 import io_pickle
12 import scipy.io
13 import sys
14
15 from genome_utils import load_genomic
16 from Genefinding import *
17
def paths_load_data(filename,expt,genome_info,PAR,test=False):
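    """
    Load a pickled data set and unpack it into individual lists.

    The pickle is expected to contain a 10-element sequence in exactly the
    order returned below; expt, genome_info, PAR and test are currently
    unused here.
    """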

    #data = io_pickle.load(filename)
    data = cPickle.load(open(filename))

    Sequences = data[0]
    Acceptors = data[1]
    Donors = data[2]
    Exons = data[3]
    Ests = data[4]
    OriginalEsts = data[5]
    Qualities = data[6]
    UpCut = data[7]
    StartPos = data[8]
    AlternativeSequences = data[9]

    return Sequences, Acceptors, Donors, Exons, Ests, OriginalEsts, Qualities,\
        UpCut, StartPos, AlternativeSequences


def paths_load_data_solexa(expt,genome_info,PAR,test=False):
39 """
40
41 """
    # expt can be 'training', 'validation' or 'test'
    assert expt in ['training','validation','test']

    dna_filename = Conf.dna_filename
    est_filename = Conf.est_filename

    allGenes = cPickle.load(open(dna_filename))

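    # allGenes maps a gene id to a gene object that provides (at least)
    # .start and .stop genomic coordinates; it is indexed with the gene_id
    # column of the read file parsed below.
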
    Sequences = []
    Acceptors = []
    Donors = []
    Exons = []
    Ests = []
    Qualities = []
    SplitPositions = []

    rrp = RemappedReadParser(est_filename)

    # Iterate over all reads
    for line in open(est_filename):
        line = line.strip()
        # NOTE: this unpacking follows the (previously commented) field layout
        # that the rest of the loop actually uses (seq, splitpos, prb, cal,
        # chastity, gene_id and the read/exon boundaries); the newer,
        # incompatible layout is kept below until the loop body is adapted:
        #id,chr,pos,strand,mismatches,align_len,offset,alignment = line.split()
        chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()

        splitpos = int(splitpos)
        length = int(length)
        prb = [ord(elem)-50 for elem in prb]
        cal = [ord(elem)-64 for elem in cal]
        chastity = [ord(elem)+10 for elem in chastity]

        assert len(prb) == len(seq)

        currentGene = allGenes[gene_id]
        seq = seq.lower()

        #try:
        #   currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower()
        #except:
        #   continue

        genome_file = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
        chrom = 'chr1'

        # use matlab-style functions to access dna sequence
        dna = load_genomic(chrom,'+',currentGene.start,currentGene.stop,genome_file,one_based=True)
        currentSeq = dna

        if len(currentSeq) < (currentGene.stop - currentGene.start):
            continue

        # compare both sequences
        #assert dna == currentSeq, pdb.set_trace()

        p_start = int(p_start)
        exon_stop = int(exon_stop)
        exon_start = int(exon_start)
        p_stop = int(p_stop)

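        # Sanity checks on the reported coordinates: p_start/p_stop bound the
        # whole spliced alignment, exon_stop/exon_start flank the intron, and
        # the two matched fragments together must cover the full 36 nt read.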
        assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
        assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
        assert p_stop - p_start >= 36

        currentExons = zeros((2,2))

        currentExons[0,0] = p_start
        currentExons[0,1] = exon_stop

        currentExons[1,0] = exon_start
        currentExons[1,1] = p_stop

        # make indices relative to start pos of the current gene
        currentExons -= currentGene.start

        # determine cutting positions for sequences
        up_cut = int(currentExons[0,0])
        down_cut = int(currentExons[1,1])

        # in test mode cut out a much wider region (+/- 500 nt, clipped to the
        # sequence boundaries) so that prediction quality can be assessed on a
        # whole region rather than just the annotated exon pair
        if test:
            up_cut = up_cut-500
            if up_cut < 0:
                up_cut = 0

            down_cut = down_cut+500

            if down_cut > len(currentSeq):
                down_cut = len(currentSeq)

        # cut out part of the sequence
        currentSeq = currentSeq[up_cut:down_cut]

        # adjust the first exon's end index so that it points at the first
        # intron position (the splice-site check below relies on this)
        currentExons[0,1] += 1

        if test:
            currentExons -= up_cut
        else:
            currentExons -= currentExons[0,0]

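        # skip reads whose inferred intron does not show the canonical
        # splice-site dinucleotides: 'gt'/'gc' at the donor site and 'ag'
        # at the acceptor site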
        try:
            if not (currentSeq[int(currentExons[0,1])] == 'g' and\
                    currentSeq[int(currentExons[0,1])+1] in ['c','t']):
                continue

            if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
                continue

        except:
            pdb.set_trace()

        Sequences.append(currentSeq)

        # now we use interval_query to fetch the precomputed splice-site
        # scores (trained on the TAIR sequence and annotation) for the
        # cut-out region
        interval_matrix = createIntArrayFromList([currentGene.start+up_cut,currentGene.start+down_cut])
        num_fields = 1
        num_intervals = 1

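        # For each queried region the Genef object returns hit positions (in
        # genomic coordinates) and one score per field; below the hits are
        # mapped back to positions relative to the cut-out sequence, with
        # -inf at positions that received no prediction.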
        # fetch acceptor scores
        fname = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_1+'
        gf = Genef()
        num_hits = gf.query(fname, ['Conf'], num_fields, interval_matrix, num_intervals)
        assert num_hits <= len(currentSeq)

        #print 'Acceptor hits: %d' % num_hits

        pos = createIntArrayFromList([0]*num_hits)
        indices = createIntArrayFromList([0]*num_hits)
        scores = createDoubleArrayFromList([0]*num_hits*num_fields)
        gf.getResults(pos,indices,scores)

        currentAcceptors = [-inf]*(len(currentSeq)+1)

        for i in range(num_hits):
            position = pos[i]
            position -= (currentGene.start+up_cut)
            assert 0 <= position < len(currentSeq)+1, 'position index wrong'
            currentAcceptors[position] = scores[i]

        currentAcceptors = currentAcceptors[1:] + [-inf]
        Acceptors.append(currentAcceptors)

        del gf

        # fetch donor scores
        fname = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+'
        gf = Genef()
        num_hits = gf.query(fname, ['Conf'], num_fields, interval_matrix, num_intervals)
        assert num_hits <= len(currentSeq)

        #print 'Donor hits: %d' % num_hits
        pos = createIntArrayFromList([0]*num_hits)
        indices = createIntArrayFromList([0]*num_hits)
        scores = createDoubleArrayFromList([0]*num_hits*num_fields)
        gf.getResults(pos,indices,scores)

        currentDonors = [-inf]*(len(currentSeq))

        try:
            for i in range(1,num_hits):
                position = pos[i]
                position -= (currentGene.start+up_cut)
                currentDonors[position-1] = scores[i]
        except:
            pdb.set_trace()

        currentDonors = [-inf] + currentDonors[:-1]
        Donors.append(currentDonors)

        Exons.append(currentExons)
        Ests.append(seq)
        Qualities.append(prb)

        SplitPositions.append(splitpos)

        #if len(Sequences[-1]) == 2755:
        #   Sequences = Sequences[:-1]
        #   Acceptors = Acceptors[:-1]
        #   Donors = Donors[:-1]
        #   Exons = Exons[:-1]
        #   Ests = Ests[:-1]
        #   Qualities = Qualities[:-1]
        #   SplitPositions = SplitPositions[:-1]

    #print 'found %d examples' % len(Sequences)

    return Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions


def paths_load_data_pickle(expt,genome_info,PAR):
    """
    Load the relevant pickle file and return the alignment data:
    Sequences, Acceptors, Donors, Exons, Ests, Noises.

    expt can be 'training', 'validation' or 'test'; currently only the
    'training' branch actually loads data.
    """
    assert expt in ['training','validation','test']

    tmp_dir = '/fml/ag-raetsch/home/fabio/tmp'

    Noises = []

    if expt == 'training':
        if PAR.microexon:
            if PAR.LOCAL_ALIGN: # local version

                train_data = '%s/microexon_train_data.pickle' % genome_info.basedir
                data = cPickle.load(open(train_data))

            else: # global version
                pass

        else:
            train_data = '%s/exons_train_local.pickle' % genome_info.basedir
            data = cPickle.load(open(train_data))

    #print 'train_data is %s' % train_data

    Sequences = data['Train']      # dna sequences
    Acceptors = data['TrainAcc']   # acceptor scores
    Donors    = data['TrainDon']   # donor scores
    Exons     = data['TrainExon']  # exon boundaries
    Ests      = data['TrainEsts']  # est sequences

    # lower all indices by one to convert matlab to python indices
    Exons -= 1

    return Sequences, Acceptors, Donors, Exons, Ests, Noises