4116e12f8433df227be2bf84659ab933014da1a5
[qpalma.git] / qpalma / DatasetUtils.py
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
8
import array
import cPickle
import os
import os.path
import pdb

import numpy

from sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,unbracket_seq,create_bracket_seq,reconstruct_dna_seq
16
17 jp = os.path.join
18
19 illumina_ga_range = (-5,40)
20 #roche_454_range =
21
22
def processQuality(raw_qualities,prb_offset,quality_interval,perform_checks):
    """
    Convert a raw ASCII quality string into a compact signed-char array.

    Each quality value is the character's ordinal minus prb_offset. A signed
    char can range as follows: -128 <= elem <= 127, which is enough for the
    platform quality intervals used here. When perform_checks is set, every
    value is asserted to lie inside quality_interval (inclusive bounds).
    """

    qualities = [ord(ch) - prb_offset for ch in raw_qualities]

    if perform_checks:
        for q in qualities:
            assert quality_interval[0] <= q <= quality_interval[1], 'Error: Your read has invalid quality values: %d' % q

    return array.array('b',qualities)
36
37
38 def checkExons(dna,exons,readAlignment,exampleKey):
39 """
40
41 """
42
43 fetched_dna_subseq = dna[exons[0,0]:exons[0,1]] + dna[exons[1,0]:exons[1,1]]
44
45 donor_elem = dna[exons[0,1]:exons[0,1]+2]
46 acceptor_elem = dna[exons[1,0]-2:exons[1,0]]
47
48 if not ( donor_elem == 'gt' or donor_elem == 'gc' ):
49 print 'invalid donor in example %d'% exampleKey
50 return False
51
52 if not ( acceptor_elem == 'ag' ):
53 print 'invalid acceptor in example %d'% exampleKey
54 return False
55
56 read = unbracket_seq(readAlignment)
57 read = read.replace('-','')
58
59 assert len(fetched_dna_subseq) == len(read), pdb.set_trace()
60
61 return True
62
63
64 def generateTrainingDataset(settings):
65 """
66 This function creates a training dataset.
67
68
69 Create lower case original ("bracketed") reads
70
71 """
72
73 dataset = []
74
75 half_window_size = settings['half_window_size']
76
77 instance_counter = 0
78
79 accessWrapper = DataAccessWrapper(settings)
80 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
81
82 for line in open(settings['training_data_fn']):
83 line = line.strip()
84 if line.startswith('#') or line == '':
85 continue
86
87 if instance_counter > 0 and instance_counter % 5000 == 0:
88 print 'processed %d examples' % instance_counter
89
90 slist = line.split()
91
92 id = int(slist[0])
93 chromo = int(slist[1])
94
95 # for the time being we only consider chromosomes 1-5
96 if not chromo in settings['allowed_fragments']:
97 continue
98
99 # convert D/P strand info to +/-
100 strand = ['-','+'][slist[2] == 'D']
101
102 seqBeginning = int(slist[3])
103 seqEnd = int(slist[4])
104
105 readAlignment = slist[5]
106 prb = processQuality(slist[6],prb_offset,quality_intervals,perform_checks)
107
108 exons = numpy.mat([0,0,0,0]).reshape((2,2))
109
110 exons[0,0] = int(slist[7])
111 exons[0,1] = int(slist[8])
112 exons[1,0] = int(slist[9])
113 exons[1,1] = int(slist[10])
114
115 dna = seqInfo.get_seq_and_scores(chr,strand,up_cut,down_cut,only_seq=True)
116 relative_exons = exons - up_cut
117
118 assert checkExons(dna,relative_exons,readAlignment,id)
119
120 currentSeqInfo = (id,chromo)
121
122 dataset.setdefault(id, []).append((currentSeqInfo,readAlignment,[prb],exons))
123
124 saveData('training',dataset,settings)
125
126
127 def addToDataset(map_file,dataset,settings):
128 assert os.path.exists(map_file), 'Error: Can not find map file'
129
130 perform_checks = settings['perform_checks']
131
132 prb_offset = settings['prb_offset']
133
134 # This tuple specifies an interval for valid Illumina Genome Analyzer quality values
135 if settings['platform'] == 'IGA':
136 quality_interval = illumina_ga_range
137
138 # this means that around each vmatch hit we cut out a window of 1500 bases
139 # each up-/downstream.
140 half_window_size = settings['half_window_size']
141
142 instance_counter = 0
143
144 accessWrapper = DataAccessWrapper(settings)
145 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
146
147 for line in open(map_file):
148 line = line.strip()
149 if line.startswith('#') or line == '':
150 continue
151
152 if instance_counter > 0 and instance_counter % 5000 == 0:
153 print 'processed %d examples' % instance_counter
154
155 slist = line.split()
156
157 id = int(slist[0])
158 chromo = int(slist[1])
159 pos = int(slist[2])
160
161 # for the time being we only consider chromosomes 1-5
162 if not chromo in settings['allowed_fragments']:
163 continue
164
165 # convert D/P strand info to +/-
166 strand = ['-','+'][slist[3] == 'D']
167
168 # QPalma uses lowercase characters
169 bracket_seq = slist[4].lower()
170 read_seq = unbracket_seq(bracket_seq)
171
172 # in prediction we do not have any bracketed reads anymore
173 assert not '[' in read_seq and not ']' in read_seq
174
175 # we use an area of +/- `self.half_window_size` nucleotides around the seed position
176 if pos > half_window_size+1:
177 us_offset = half_window_size
178 else:
179 us_offset = pos - 1
180
181 if pos+half_window_size < seqInfo.getFragmentSize(chromo):
182 ds_offset = half_window_size
183 else:
184 ds_offset = seqInfo.getFragmentSize(chromo)-pos-1
185
186 genomicSeq_start = pos - us_offset
187 genomicSeq_stop = pos + ds_offset
188
189 dna_seq = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,True)
190
191 prb = processQuality(slist[5],prb_offset,quality_interval,perform_checks)
192
193 currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
194
195 # As one single read can have several vmatch matches we store all these
196 # matches under the unique id of the read
197 dataset.setdefault(id, []).append((currentSeqInfo,read_seq,[prb]))
198
199 instance_counter += 1
200
201 print 'Total examples processed: %d' % instance_counter
202
203 return dataset
204
205
def generatePredictionDataset(settings):
    """
    Build the prediction dataset from the matches of the second vmatch run
    and the heuristic-filtered matches of the first run.

    An example in our dataset consists of:

    - information on the original read:
        * id
        * nucleotide sequence
        * quality vectors (at the moment: prb and chastity)

    - information on the target dna sequence:
        * chromosome
        * strand
        * fragment start
        * fragment end positions

    This information is stored in tuples which are collected in a dictionary
    indexed by the read's unique id.

    CAUTION: The dictionary stores a list of the above defined tuples under
    each 'id', because one read can be found several times on the genome.
    """

    dataset = {}

    # both map files contribute matches to the very same dataset dict
    map_files = (
        settings['spliced_reads_fn'],
        jp(settings['approximation_dir'],'map.vm.spliced'),
    )

    for map_file in map_files:
        dataset = addToDataset(map_file,dataset,settings)

    saveData('prediction',dataset,settings)
240
241
def saveData(prefix,dataset,settings):
    """
    Persist `dataset` and its key list as two pickle files.

    prefix selects which pair of filenames from settings is used:
    'prediction' or 'training'; any other value is a programming error.
    """

    if prefix == 'prediction':
        dataset_fn = settings['prediction_dataset_fn']
        dataset_keys_fn = settings['prediction_dataset_keys_fn']
    elif prefix == 'training':
        dataset_fn = settings['training_dataset_fn']
        dataset_keys_fn = settings['training_dataset_keys_fn']
    else:
        assert False, 'Unknown dataset prefix: %s' % prefix

    #assert not os.path.exists(dataset_fn), 'The data file already exists!'
    #assert not os.path.exists(dataset_keys_fn), 'The data keys file already exists!'

    # saving new dataset and single keys as well.
    # fix: pickle protocol 2 is a binary format, so the files must be opened
    # in binary mode ('w+' was text mode); the with-blocks also make sure the
    # handles are flushed and closed instead of being leaked
    with open(dataset_fn,'wb') as f:
        cPickle.dump(dataset,f,protocol=2)
    with open(dataset_keys_fn,'wb') as f:
        cPickle.dump(dataset.keys(),f,protocol=2)
260 cPickle.dump(dataset.keys(),open(dataset_keys_fn,'w+'),protocol=2)