+ update makefiles to fetch automatically valid Python includes and libs
[qpalma.git] / qpalma / DatasetUtils.py
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
8
9 import array
10 import cPickle
11 import numpy
12 import os
13 import os.path
14 import pdb
15
16 from sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,unbracket_seq,create_bracket_seq,reconstruct_dna_seq
17
18 jp = os.path.join
19
20 illumina_ga_range = (-5,40)
21 #roche_454_range =
22
23
24 def processQuality(raw_qualities,prb_offset,quality_interval,perform_checks):
25 """
26 In order to save some space we use a signed char to store the
27 qualities. Each quality element can range as follows: -128 <= elem <= 127
28 """
29
30 q_values = map(lambda x: ord(x)-prb_offset,raw_qualities)
31
32 if perform_checks:
33 for entry in q_values:
34 assert quality_interval[0] <= entry <= quality_interval[1], 'Error: Your read has invalid quality values: %d' % entry
35
36 return array.array('b',q_values)
37
38
39 def checkExons(dna,exons,readAlignment,exampleKey):
40 """
41
42 """
43
44 fetched_dna_subseq = dna[exons[0,0]:exons[0,1]] + dna[exons[1,0]:exons[1,1]]
45
46 donor_elem = dna[exons[0,1]:exons[0,1]+2]
47 acceptor_elem = dna[exons[1,0]-2:exons[1,0]]
48
49 if not ( donor_elem == 'gt' or donor_elem == 'gc' ):
50 print 'invalid donor in example %d'% exampleKey
51 return False
52
53 if not ( acceptor_elem == 'ag' ):
54 print 'invalid acceptor in example %d'% exampleKey
55 return False
56
57 read = unbracket_seq(readAlignment)
58 read = read.replace('-','')
59
60 assert len(fetched_dna_subseq) == len(read), pdb.set_trace()
61
62 return True
63
64
65 def generateTrainingDataset(settings):
66 """
67 This function creates a training dataset.
68
69
70 Create lower case original ("bracketed") reads
71
72 """
73
74 dataset = {}
75
76 half_window_size = settings['half_window_size']
77
78 # This tuple specifies an interval for valid Illumina Genome Analyzer quality values
79 if settings['platform'] == 'IGA':
80 quality_interval = illumina_ga_range
81
82 instance_counter = 0
83
84 accessWrapper = DataAccessWrapper(settings)
85 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
86
87 for line in open(settings['training_reads_fn']):
88 line = line.strip()
89 if line.startswith('#') or line == '':
90 continue
91
92 if instance_counter > 0 and instance_counter % 5000 == 0:
93 print 'processed %d examples' % instance_counter
94
95 slist = line.split()
96
97 id = int(slist[0])
98 chromo = int(slist[1])
99
100 # for the time being we only consider chromosomes 1-5
101 if not chromo in settings['allowed_fragments']:
102 continue
103
104 # convert D/P strand info to +/-
105 strand = slist[2]
106
107 if strand in ['-','+']:
108 pass
109 elif strand in ['D','P']:
110 strand = ['-','+'][strand == 'D']
111 else:
112 print 'Error strand information has to be either +/- or D/P'
113 sys.exit(1)
114
115 seqBeginning = int(slist[3])
116 seqEnd = int(slist[4])
117
118 readAlignment = slist[5]
119 prb = processQuality(slist[6],settings['prb_offset'],quality_interval,settings['perform_checks'])
120
121 exons = numpy.mat([0,0,0,0]).reshape((2,2))
122
123 exons[0,0] = int(slist[7])
124 exons[0,1] = int(slist[8])
125 exons[1,0] = int(slist[9])
126 exons[1,1] = int(slist[10])
127
128 dna = seqInfo.get_seq_and_scores(chromo,strand,seqBeginning,seqEnd,only_seq=True)
129 relative_exons = exons - seqBeginning
130
131 assert checkExons(dna,relative_exons,readAlignment,id)
132
133 currentSeqInfo = (id,chromo,strand,seqBeginning,seqEnd)
134
135 dataset[id] = (currentSeqInfo,readAlignment,[prb],exons)
136
137 # dataset.setdefault(id, []).append()
138
139 saveData('training',dataset,settings)
140
141
142 def addToDataset(map_file,dataset,settings):
143 assert os.path.exists(map_file), 'Error: Can not find map file'
144
145 # This tuple specifies an interval for valid Illumina Genome Analyzer quality values
146 if settings['platform'] == 'IGA':
147 quality_interval = illumina_ga_range
148
149 # this means that around each vmatch hit we cut out a window of 1500 bases
150 # each up-/downstream.
151 half_window_size = settings['half_window_size']
152
153 instance_counter = 0
154
155 accessWrapper = DataAccessWrapper(settings)
156 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
157
158 for line in open(map_file):
159 line = line.strip()
160 if line.startswith('#') or line == '':
161 continue
162
163 if instance_counter > 0 and instance_counter % 5000 == 0:
164 print 'processed %d examples' % instance_counter
165
166 slist = line.split()
167
168 id = int(slist[0])
169 chromo = int(slist[1])
170 pos = int(slist[2])
171
172 # for the time being we only consider chromosomes 1-5
173 if not chromo in settings['allowed_fragments']:
174 continue
175
176 # convert D/P strand info to +/-
177 strand = ['-','+'][slist[3] == 'D']
178
179 # QPalma uses lowercase characters
180 bracket_seq = slist[4].lower()
181 read_seq = unbracket_seq(bracket_seq)
182
183 # in prediction we do not have any bracketed reads anymore
184 assert not '[' in read_seq and not ']' in read_seq
185
186 # we use an area of +/- `self.half_window_size` nucleotides around the seed position
187 if pos > half_window_size+1:
188 us_offset = half_window_size
189 else:
190 us_offset = pos - 1
191
192 if pos+half_window_size < seqInfo.getFragmentSize(chromo):
193 ds_offset = half_window_size
194 else:
195 ds_offset = seqInfo.getFragmentSize(chromo)-pos-1
196
197 genomicSeq_start = pos - us_offset
198 genomicSeq_stop = pos + ds_offset
199
200 dna_seq = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,True)
201
202 prb = processQuality(slist[5],settings['prb_offset'],quality_interval,settings['perform_checks'])
203
204 currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
205
206 # As one single read can have several vmatch matches we store all these
207 # matches under the unique id of the read
208 dataset.setdefault(id, []).append((currentSeqInfo,read_seq,[prb]))
209
210 instance_counter += 1
211
212 print 'Total examples processed: %d' % instance_counter
213
214 return dataset
215
216
217 def generatePredictionDataset(settings):
218 """
219 This function is responsible for the prediction dataset generation for the matches of
220 the second vmatch run and the heuristic-filtered matches of the first run.
221
222 An example in our dataset consists of:
223
224 - information on the original read:
225 * id
226 * nucleotide sequence
227 * quality vectors (at the moment: prb and chastity)
228
229 - information on the target dna sequence:
230 * chromosome
231 * strand
232 * fragment start
233 * fragment end positions
234
235 this information is stored in tuples which are stored then in a dictionary
236 indexed by the reads unique id.
237
238 CAUTION: The dictionary stores a list of the above defined tuples under each
239 'id'. This is because one read can be found several times on the genome.
240 """
241
242 dataset = {}
243
244 map_file = settings['spliced_reads_fn']
245 dataset = addToDataset(map_file,dataset,settings)
246
247 map_file = jp(settings['approximation_dir'],'map.vm.spliced')
248 dataset = addToDataset(map_file,dataset,settings)
249
250 saveData('prediction',dataset,settings)
251
252
253 def saveData(prefix,dataset,settings):
254 """
255 """
256
257 if prefix == 'prediction':
258 dataset_fn = settings['prediction_dataset_fn']
259 dataset_keys_fn = settings['prediction_dataset_keys_fn']
260 elif prefix == 'training':
261 dataset_fn = settings['training_dataset_fn']
262 dataset_keys_fn = settings['training_dataset_keys_fn']
263 else:
264 assert False
265
266 #assert not os.path.exists(dataset_fn), 'The data file already exists!'
267 #assert not os.path.exists(dataset_keys_fn), 'The data keys file already exists!'
268
269 # saving new dataset and single keys as well
270 cPickle.dump(dataset,open(dataset_fn,'w+'),protocol=2)
271 cPickle.dump(dataset.keys(),open(dataset_keys_fn,'w+'),protocol=2)