modified filtering of reads
[qpalma.git] / qpalma / DataProc.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from numpy.matlib import mat,zeros,ones,inf
import cPickle
import pdb
import Configuration as Conf
import tools
from tools.PyGff import *
import io_pickle
import scipy.io
import sys

from genome_utils import load_genomic

def paths_load_data(filename,expt,genome_info,PAR,test=False):
    """
    Load a previously pickled data set and return its components.
    """

    data = io_pickle.load(filename)

    Sequences = data['Sequences']
    Acceptors = data['Acceptors']
    Donors = data['Donors']
    Exons = data['Exons']
    Ests = data['Ests']
    Qualities = data['Qualities']
    SplitPositions = data['SplitPositions']

    return Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions
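
# A minimal sketch (not part of the original module) of how a matching data
# file could be produced: paths_load_data() above expects a pickled dict with
# exactly the seven keys it unpacks. io_pickle.load() is used above; a
# symmetric io_pickle.save(filename, data) is assumed to exist here.
def paths_save_data_sketch(filename, Sequences, Acceptors, Donors, Exons,
                           Ests, Qualities, SplitPositions):
    data = {'Sequences'      : Sequences,
            'Acceptors'      : Acceptors,
            'Donors'         : Donors,
            'Exons'          : Exons,
            'Ests'           : Ests,
            'Qualities'      : Qualities,
            'SplitPositions' : SplitPositions}
    io_pickle.save(filename, data)
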

def paths_load_data_solexa(expt,genome_info,PAR,test=False):
    """
    Parse the remapped Solexa reads in Conf.est_filename and collect, for
    every read that passes the filters below, its genomic sequence,
    splice-site score tracks, exon boundaries, read sequence, quality
    values and split position.
    """
    # expt can be 'training','validation' or 'test'
    assert expt in ['training','validation','test']

    dna_filename = Conf.dna_filename
    est_filename = Conf.est_filename

    allGenes = cPickle.load(open(dna_filename))

    Sequences = []
    Acceptors = []
    Donors = []
    Exons = []
    Ests = []
    Qualities = []
    SplitPositions = []

    rrp = RemappedReadParser(est_filename)

    # Iterate over all reads
    for line in open(est_filename):
        line = line.strip()
        # one remapped read per line: chromosome, strand, read sequence,
        # split position, read length, three quality tracks (prb, cal,
        # chastity), gene id and the four exon/intron boundary coordinates
        #id,chr,pos,strand,mismatches,align_len,offset,alignment = line.split()
        chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()

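        # decode the ASCII-shifted quality tracks into integer scores; the
        # fixed offsets below (-50 for prb, -64 for cal, +10 for chastity)
        # are taken over unchanged and presumably match the Solexa export
        # encoding used for these files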
        splitpos = int(splitpos)
        length = int(length)
        prb = [ord(elem)-50 for elem in prb]
        cal = [ord(elem)-64 for elem in cal]
        chastity = [ord(elem)+10 for elem in chastity]

        assert len(prb) == len(seq)

        currentGene = allGenes[gene_id]
        seq = seq.lower()

        #try:
        #   currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower()
        #except:
        #   continue

        genome_file = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
        chrom = 'chr1'

        # use matlab-style functions to access dna sequence
        dna = load_genomic(chrom,'+',currentGene.start,currentGene.stop,genome_file,one_based=True)
        currentSeq = dna

        if len(currentSeq) < (currentGene.stop - currentGene.start):
            continue

        # compare both sequences
        #assert dna == currentSeq, pdb.set_trace()

        p_start = int(p_start)
        exon_stop = int(exon_stop)
        exon_start = int(exon_start)
        p_stop = int(p_stop)

        assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
        assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
        assert p_stop - p_start >= 36

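        # currentExons is a 2x2 matrix: row 0 holds the upstream exon
        # fragment [p_start, exon_stop], row 1 the downstream fragment
        # [exon_start, p_stop]; both rows are made relative to the gene
        # start a few lines below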
        currentExons = zeros((2,2))

        currentExons[0,0] = p_start
        currentExons[0,1] = exon_stop

        currentExons[1,0] = exon_start
        currentExons[1,1] = p_stop

        # make indices relative to start pos of the current gene
        currentExons -= currentGene.start

        # determine cutting positions for sequences
        up_cut = int(currentExons[0,0])
        down_cut = int(currentExons[1,1])

        # when testing we cut out a much wider region in order to see how
        # well we perform on a whole region rather than on a tight window
        if test:
            up_cut = up_cut-500
            if up_cut < 0:
                up_cut = 0

            down_cut = down_cut+500

            if down_cut > len(currentSeq):
                down_cut = len(currentSeq)

        # cut out part of the sequence
        currentSeq = currentSeq[up_cut:down_cut]

        # shift the first exon's stop by one so that it points at the first
        # intron base (the donor site checked below)
        currentExons[0,1] += 1

        if test:
            currentExons -= up_cut
        else:
            currentExons -= currentExons[0,0]

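        # filtering of reads: keep only reads whose inferred intron starts
        # with a canonical or semi-canonical donor dinucleotide ('gt' or
        # 'gc') and ends with the acceptor dinucleotide 'ag'; everything
        # else is skipped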
        try:
            if not (currentSeq[int(currentExons[0,1])] == 'g' and\
                    currentSeq[int(currentExons[0,1])+1] in ['c','t' ]):
                continue

            if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
                continue

        except:
            pdb.set_trace()

        Sequences.append(currentSeq)

        # now we want to use interval_query to get the predicted splice scores
        # trained on the TAIR sequence and annotation

        from Genefinding import *

        interval_matrix = createIntArrayFromList([currentGene.start+up_cut,currentGene.start+down_cut])
        num_fields = 1
        num_intervals = 1
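
        # query mechanics as used below: gf.query() returns the number of
        # stored predictions that overlap the given genomic interval, and
        # gf.getResults() then fills the pre-allocated position/index/score
        # arrays with those hits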

        # fetch acceptor scores
        fname = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_1+'
        gf = Genef()
        num_hits = gf.query(fname, ['Conf'], num_fields, interval_matrix, num_intervals)
        assert num_hits <= len(currentSeq)

        #print 'Acceptor hits: %d' % num_hits

        pos = createIntArrayFromList([0]*num_hits)
        indices = createIntArrayFromList([0]*num_hits)
        scores = createDoubleArrayFromList([0]*num_hits*num_fields)
        gf.getResults(pos,indices,scores)

        currentAcceptors = [-inf]*(len(currentSeq)+1)

        for i in range(num_hits):
            position = pos[i]
            position -= (currentGene.start+up_cut)
            assert 0 <= position < len(currentSeq)+1, 'position index wrong'
            currentAcceptors[position] = scores[i]

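        # the score tracks are shifted by one position: the acceptor track
        # one to the left here, the donor track one to the right further
        # down, each padded with -inf at the vacated end; presumably this
        # attaches every score to the position convention expected by the
        # downstream alignment code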
        currentAcceptors = currentAcceptors[1:] + [-inf]
        Acceptors.append(currentAcceptors)

        del gf

        # fetch donor scores
        fname = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+'
        gf = Genef()
        num_hits = gf.query(fname, ['Conf'], num_fields, interval_matrix, num_intervals)
        assert num_hits <= len(currentSeq)

        #print 'Donor hits: %d' % num_hits
        pos = createIntArrayFromList([0]*num_hits)
        indices = createIntArrayFromList([0]*num_hits)
        scores = createDoubleArrayFromList([0]*num_hits*num_fields)
        gf.getResults(pos,indices,scores)

        currentDonors = [-inf]*(len(currentSeq))

        try:
            for i in range(1,num_hits):
                position = pos[i]
                position -= (currentGene.start+up_cut)
                currentDonors[position-1] = scores[i]
        except:
            pdb.set_trace()

        currentDonors = [-inf] + currentDonors[:-1]
        Donors.append(currentDonors)

        Exons.append(currentExons)
        Ests.append(seq)
        Qualities.append(prb)

        SplitPositions.append(splitpos)

        #if len(Sequences[-1]) == 2755:
        #   Sequences = Sequences[:-1]
        #   Acceptors = Acceptors[:-1]
        #   Donors = Donors[:-1]
        #   Exons = Exons[:-1]
        #   Ests = Ests[:-1]
        #   Qualities = Qualities[:-1]
        #   SplitPositions = SplitPositions[:-1]

    #print 'found %d examples' % len(Sequences)

    return Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions


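# Usage sketch (illustrative only; 'genome_info' and 'PAR' stand for the
# objects the caller already has, as in the other loaders of this module):
#
#   Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions = \
#       paths_load_data_solexa('training', genome_info, PAR)
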
def paths_load_data_pickle(expt,genome_info,PAR):
    # function [Sequences, Acceptors, Donors, Exons, Ests, Noises] = paths_load_data(expt,genome_info,PAR)
    # Load the relevant file and return the alignment data

    # expt can be 'training','validation' or 'test'

    assert expt in ['training','validation','test']

    tmp_dir = '/fml/ag-raetsch/home/fabio/tmp'

    Noises = []

    if expt == 'training':
        if PAR.microexon:
            if PAR.LOCAL_ALIGN: # local version

                train_data = '%s/microexon_train_data.pickle' % genome_info.basedir
                data = cPickle.load(open(train_data))

            else: # global version
                pass

        else:
            train_data = '%s/exons_train_local.pickle' % genome_info.basedir
            data = cPickle.load(open(train_data))

    #print 'train_data is %s' % train_data

    Sequences = data['Train']       # dna sequences
    Acceptors = data['TrainAcc']    # acceptor scores
    Donors    = data['TrainDon']    # donor scores
    Exons     = data['TrainExon']   # exon boundaries
    Ests      = data['TrainEsts']   # est sequences

    # lower all indices by one to convert MATLAB (1-based) to
    # Python (0-based) indices
    Exons -= 1

    return Sequences, Acceptors, Donors, Exons, Ests, Noises
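
# Usage sketch (illustrative only; the code above assumes PAR exposes the
# 'microexon' and 'LOCAL_ALIGN' flags and genome_info a 'basedir' attribute):
#
#   Sequences, Acceptors, Donors, Exons, Ests, Noises = \
#       paths_load_data_pickle('training', genome_info, PAR)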