git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8610 e1793c9e...
[qpalma.git] / qpalma / DataProc.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from numpy.matlib import mat,zeros,ones,inf
5 import cPickle
6 import pdb
7 #import pydb
8 import Configuration as Conf
9 import tools
10 from tools.PyGff import *
11 import io_pickle
12 #import scipy.io
13 import sys
14
15 from genome_utils import load_genomic
16 from Genefinding import *
17
18 def paths_load_data(filename):
19
20 #data = io_pickle.load(filename)
21 data = cPickle.load(open(filename))
22
23 SeqInfo = data[0]
24 Exons = data[1]
25 OriginalEsts= data[2]
26 Qualities = data[3]
27 AlternativeSequences = data[4]
28
29 return SeqInfo, Exons, OriginalEsts, Qualities,\
30 AlternativeSequences
31
32
33 def paths_load_data_solexa(expt,genome_info,PAR,test=False):
34 """
35
36 """
37 # expt can be 'training','validation' or 'test'
38 assert expt in ['training','validation','test']
39
40 dna_filename = Conf.dna_filename
41 est_filename = Conf.est_filename
42
43 pdb.set_trace()
44
45 allGenes = cPickle.load(open(dna_filename))
46
47 Sequences = []
48 Acceptors = []
49 Donors = []
50 Exons = []
51 Ests = []
52 Qualities = []
53 SplitPositions = []
54
55 rrp = RemappedReadParser('est_filename')
56
57 # Iterate over all reads
58 for line in open(est_filename):
59 line = line.strip()
60 #chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
61 #chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
62 id,chr,pos,strand,mismatches,align_len,offset,alignment = line.split()
63
64 splitpos = int(splitpos)
65 length = int(length)
66 prb = [ord(elem)-50 for elem in prb]
67 cal = [ord(elem)-64 for elem in cal]
68 chastity = [ord(elem)+10 for elem in chastity]
69
70 assert len(prb) == len(seq)
71
72 currentGene = allGenes[gene_id]
73 seq = seq.lower()
74
75 #try:
76 # currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower()
77 #except:
78 # continue
79
80 genome_file = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
81 chrom = 'chr1'
82
83 # use matlab-style functions to access dna sequence
84 dna = load_genomic(chrom,'+',currentGene.start,currentGene.stop,genome_file,one_based=True)
85 currentSeq = dna
86
87 if len(currentSeq) < (currentGene.stop - currentGene.start):
88 continue
89
90 # compare both sequences
91 #assert dna == currentSeq, pdb.set_trace()
92
93 p_start = int(p_start)
94 exon_stop = int(exon_stop)
95 exon_start = int(exon_start)
96 p_stop = int(p_stop)
97
98 assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
99 assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
100 assert p_stop - p_start >= 36
101
102 currentExons = zeros((2,2))
103
104 currentExons[0,0] = p_start
105 currentExons[0,1] = exon_stop
106
107 currentExons[1,0] = exon_start
108 currentExons[1,1] = p_stop
109
110 # make indices relative to start pos of the current gene
111 currentExons -= currentGene.start
112
113 # determine cutting positions for sequences
114 up_cut = int(currentExons[0,0])
115 down_cut = int(currentExons[1,1])
116
117 # if we perform testing we cut a much wider region as we want to detect
118 # how good we perform on a region
119 if test:
120 up_cut = up_cut-500
121 if up_cut < 0:
122 up_cut = 0
123
124 down_cut = down_cut+500
125
126 if down_cut > len(currentSeq):
127 down_cut = len(currentSeq)
128
129 # cut out part of the sequence
130 currentSeq = currentSeq[up_cut:down_cut]
131
132 # ensure that index is correct
133 currentExons[0,1] += 1
134
135 if test:
136 currentExons -= up_cut
137 else:
138 currentExons -= currentExons[0,0]
139
140 try:
141 if not (currentSeq[int(currentExons[0,1])] == 'g' and\
142 currentSeq[int(currentExons[0,1])+1] in ['c','t' ]):
143 continue
144
145 if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
146 continue
147
148 except:
149 pdb.set_trace()
150
151 Sequences.append(currentSeq)
152
153 # now we want to use interval_query to get the predicted splice scores
154 # trained on the TAIR sequence and annotation
155
156
157 interval_matrix = createIntArrayFromList([currentGene.start+up_cut,currentGene.start+down_cut])
158 num_fields = 1
159 num_intervals = 1
160
161 # fetch acceptor scores
162 fname = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_1+'
163 gf = Genef()
164 num_hits = gf.query(fname, ['Conf'], num_fields, interval_matrix, num_intervals)
165 assert num_hits <= len(currentSeq)
166
167 #print 'Acceptor hits: %d' % num_hits
168
169 pos = createIntArrayFromList([0]*num_hits)
170 indices = createIntArrayFromList([0]*num_hits)
171 scores = createDoubleArrayFromList([0]*num_hits*num_fields)
172 gf.getResults(pos,indices,scores)
173
174 currentAcceptors = [-inf]*(len(currentSeq)+1)
175
176 for i in range(num_hits):
177 position = pos[i]
178 position -= (currentGene.start+up_cut)
179 assert 0 <= position < len(currentSeq)+1, 'position index wrong'
180 currentAcceptors[position] = scores[i]
181
182 currentAcceptors = currentAcceptors[1:] + [-inf]
183 Acceptors.append(currentAcceptors)
184
185 del gf
186
187 # fetch donor scores
188 fname = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+'
189 gf = Genef()
190 num_hits = gf.query(fname, ['Conf'], num_fields, interval_matrix, num_intervals)
191 assert num_hits <= len(currentSeq)
192
193 #print 'Donor hits: %d' % num_hits
194 pos = createIntArrayFromList([0]*num_hits)
195 indices = createIntArrayFromList([0]*num_hits)
196 scores = createDoubleArrayFromList([0]*num_hits*num_fields)
197 gf.getResults(pos,indices,scores)
198
199 currentDonors = [-inf]*(len(currentSeq))
200
201 try:
202 for i in range(1,num_hits):
203 position = pos[i]
204 position -= (currentGene.start+up_cut)
205 currentDonors[position-1] = scores[i]
206 except:
207 pdb.set_trace()
208
209 currentDonors = [-inf] + currentDonors[:-1]
210 Donors.append(currentDonors)
211
212 Exons.append(currentExons)
213 Ests.append(seq)
214 Qualities.append(prb)
215
216 SplitPositions.append(splitpos)
217
218 #if len(Sequences[-1]) == 2755:
219 # Sequences = Sequences[:-1]
220 # Acceptors = Acceptors[:-1]
221 # Donors = Donors[:-1]
222 # Exons = Exons[:-1]
223 # Ests = Ests[:-1]
224 # Qualities = Qualities[:-1]
225 # SplitPositions = SplitPositions[:-1]
226
227 #print 'found %d examples' % len(Sequences)
228
229 return Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions
230
231
232 def paths_load_data_pickle(expt,genome_info,PAR):
233 # function [Sequences, Acceptors, Donors, Exons, Ests, Noises] = paths_load_data(expt,genome_info,PAR)
234 # Load the relevant file and return the alignment data
235
236 # expt can be 'training','validation' or 'test'
237
238 assert expt in ['training','validation','test']
239
240 tmp_dir = '/fml/ag-raetsch/home/fabio/tmp'
241
242 Noises = [];
243
244 if expt == 'training':
245 if PAR.microexon:
246 if PAR.LOCAL_ALIGN: # local version
247
248 train_data = '%s/microexon_train_data.pickle' % genome_info.basedir
249 data = cPickle.load(open(train_data))
250
251 else: # global version
252 pass
253
254
255 else:
256 train_data = '%s/exons_train_local.pickle' % genome_info.basedir
257 data = cPickle.load(open(train_data))
258
259 #print 'train_data is %s' % train_data
260
261 Sequences = data['Train'] # dna sequences
262 Acceptors = data['TrainAcc'] # acceptor scores
263 Donors = data['TrainDon'] # donor scores
264 Exons = data['TrainExon'] # exon boundaries
265 Ests = data['TrainEsts'] # est sequences
266
267 # Lower all indices by one to convert matlab
268 # to python indices
269
270 Exons -= 1
271
272 return Sequences, Acceptors, Donors, Exons, Ests, Noises